Conversion pipeline: When removing the first image, also remove the HTML file that contains it, if that file has no other content

2025-07-09 03:04:10 -04:00 · 2012-07-01 10:03:26 +05:30 · 2012-07-01 10:03:26 +05:30 · b128849723
commit b128849723
parent 2b4307fc7f
1 changed file with 15 additions and 1 deletion
--- a/src/calibre/ebooks/oeb/transforms/jacket.py
+++ b/src/calibre/ebooks/oeb/transforms/jacket.py
@@ -13,7 +13,7 @@ from lxml import etree

 from calibre import guess_type, strftime
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML
+from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urldefrag
 from calibre.library.comments import comments_to_html
 from calibre.utils.date import is_date_undefined
 from calibre.ebooks.chardet import strip_encoding_declarations
@@ -41,11 +41,25 @@ class Jacket(object):
        return removed

    def remove_first_image(self):
+        deleted_item = None
        for item in self.oeb.spine:
            removed = self.remove_images(item)
            if removed > 0:
                self.log('Removed first image')
+                body = XPath('//h:body')(item.data)
+                if body:
+                    raw = xml2text(body[0]).strip()
+                    imgs = XPath('//h:img|//svg:svg')(item.data)
+                    if not raw and not imgs:
+                        self.log('Removing %s as it has no content'%item.href)
+                        self.oeb.manifest.remove(item)
+                        deleted_item = item
                break
+        if deleted_item is not None:
+            for item in list(self.oeb.toc):
+                href = urldefrag(item.href)[0]
+                if href == deleted_item.href:
+                    self.oeb.toc.remove(item)

    def insert_metadata(self, mi):
        self.log('Inserting metadata into book...')