EPUB metadata: Extract the cover image from the HTML it is embedded in if possible, instead of rendering the HTML. Removes the white margins on covers and speeds up cover extraction.

2025-08-30 23:00:21 -04:00 · 2010-02-07 12:49:54 -07:00 · 2010-02-07 12:49:54 -07:00 · a1365513dd
commit a1365513dd
parent 6a24c74e50
1 changed files with 18 additions and 0 deletions
--- a/src/calibre/ebooks/__init__.py
+++ b/src/calibre/ebooks/__init__.py
@@ -70,6 +70,19 @@ def extract_cover_from_embedded_svg(html, base, log):
        if href and os.access(path, os.R_OK):
            return open(path, 'rb').read()

+def extract_calibre_cover(raw, base, log):
+    from calibre.ebooks.BeautifulSoup import BeautifulSoup
+    soup = BeautifulSoup(raw)
+    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
+        'font', 'br'])
+    images = soup.findAll('img')
+    if matches is None and len(images) == 1 and \
+            images[0].get('alt', '')=='cover':
+        img = images[0]
+        img = os.path.join(base, *img['src'].split('/'))
+        if os.path.exists(img):
+            return open(img, 'rb').read()
+
 def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    from calibre.ebooks.oeb.base import SVG_NS
    raw = open(path_to_html, 'rb').read()
@@ -80,6 +93,11 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750):
                   os.path.dirname(path_to_html), log)
        except:
            pass
+    if data is None:
+        try:
+            data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log)
+        except:
+            pass
    if data is None:
        renderer = render_html(path_to_html, width, height)
        data = getattr(renderer, 'data', None)