Conversion pipeline: When removing the first image, also remove the html file the image is found in, if that file has no other content

This commit is contained in:
Kovid Goyal 2012-07-01 10:03:26 +05:30
parent 2b4307fc7f
commit b128849723

View File

@ -13,7 +13,7 @@ from lxml import etree
from calibre import guess_type, strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML
from calibre.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urldefrag
from calibre.library.comments import comments_to_html
from calibre.utils.date import is_date_undefined
from calibre.ebooks.chardet import strip_encoding_declarations
@ -41,11 +41,25 @@ class Jacket(object):
return removed
def remove_first_image(self):
    '''
    Strip the images from the first spine item that has any, then clean up
    after it.

    Spine items are visited in order; the search stops at the first one for
    which ``self.remove_images`` reports a positive count. If that item is
    then left with no body text and no ``img``/``svg`` elements, it is
    removed from the manifest, and every entry obtained by iterating
    ``self.oeb.toc`` whose href points at it is removed as well.
    '''
    emptied = None
    for spine_item in self.oeb.spine:
        count = self.remove_images(spine_item)
        if count <= 0:
            # Nothing was stripped from this item, keep looking.
            continue

        self.log('Removed first image')
        bodies = XPath('//h:body')(spine_item.data)
        if bodies:
            text = xml2text(bodies[0]).strip()
            remaining = XPath('//h:img|//svg:svg')(spine_item.data)
            if not (text or remaining):
                # No text and no graphics left: the file only existed to
                # hold the image that was just removed.
                self.log('Removing %s as it has no content'%spine_item.href)
                self.oeb.manifest.remove(spine_item)
                emptied = spine_item
        # Only the first item that actually contained images is processed.
        break

    if emptied is None:
        return

    # Iterate over a copy, since entries are removed from the TOC while
    # walking it.
    for node in list(self.oeb.toc):
        # Compare on the first component returned by urldefrag (presumably
        # the href without its #fragment), so anchors into the deleted file
        # match too.
        target = urldefrag(node.href)[0]
        if target == emptied.href:
            self.oeb.toc.remove(node)
def insert_metadata(self, mi):
self.log('Inserting metadata into book...')