HTML Input: Handle HTML fragments more gracefully. Fixes #4854 (Imported HTML fragments get converted to ZIPs containing no HTML)

2025-06-23 15:30:45 -04:00 · 2010-02-10 15:31:11 -07:00 · 2010-02-10 15:31:11 -07:00 · b33bfe2e43
commit b33bfe2e43
parent a0671a64d4
2 changed files with 5 additions and 3 deletions
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -111,7 +111,7 @@ class HTMLFile(object):
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)
-        self.is_binary = not bool(self.HTML_PAT.search(src[:4096]))
+        self.is_binary = level > 0 and not bool(self.HTML_PAT.search(src[:4096]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -851,8 +851,10 @@ class Manifest(object):
                    self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
                    nroot = etree.fromstring('<html><body/></html>')
                    parent = nroot[0]
-                for child in list(data):
+                for child in list(data.iter()):
-                    child.getparent().remove(child)
+                    oparent = child.getparent()
+                    if oparent is not None:
+                        oparent.remove(child)
                    parent.append(child)
                data = nroot