Fix #1695 (Cover Pages in Penguin samples)

2025-07-08 18:54:09 -04:00 · 2009-01-26 00:40:17 -08:00 · 2009-01-26 00:40:17 -08:00 · 61e22407f8
commit 61e22407f8
parent 4ce35e4fd7
3 changed files with 22 additions and 14 deletions
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -194,6 +194,8 @@ class HTMLProcessor(Processor, Rationalizer):
            if not tag.text and not tag.get('src', False):
                tag.getparent().remove(tag)
                
+        
+    
    def save(self):
        for meta in list(self.root.xpath('//meta')):
            meta.getparent().remove(meta)
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -417,39 +417,44 @@ class Parser(PreProcessor, LoggingInterface):
        self.level = self.htmlfile.level
        for f in self.htmlfiles:
            name = os.path.basename(f.path)
+            name = os.path.splitext(name)[0] + '.xhtml'
            if name in self.htmlfile_map.values():
                name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
                save_counter += 1
            self.htmlfile_map[f.path] = name
        
        self.parse_html()
+        # Handle <image> tags inside embedded <svg>
+        # At least one source of EPUB files (Penguin) uses xlink:href
+        # without declaring the xlink namespace
+        for image in self.root.xpath('//image'): 
+            for attr in image.attrib.keys():
+                if attr.endswith(':href'):
+                    nhref = self.rewrite_links(image.get(attr))
+                    image.set(attr, nhref)
+        
        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
        for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
            if self.root.get(bad, None) is not None:
                self.root.attrib.pop(bad)
        
+        
+        
    def save_path(self):
        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
    
-    def declare_xhtml_namespace(self, match):
-        if not match.group('raw'):
-            return '<html xmlns="http://www.w3.org/1999/xhtml">'
-        raw = match.group('raw')
-        m = re.search(r'(?i)xmlns\s*=\s*[\'"](?P<uri>[^"\']*)[\'"]', raw)
-        if not m:
-            return '<html xmlns="http://www.w3.org/1999/xhtml" %s>'%raw
-        else:
-            return  match.group().replace(m.group('uri'), "http://www.w3.org/1999/xhtml")
-    
    def save(self):
        '''
        Save processed HTML into the content directory.
        Should be called after all HTML processing is finished.
        '''
-        ans = tostring(self.root, pretty_print=self.opts.pretty_print)
-        ans = re.sub(r'(?i)<\s*html(?P<raw>\s+[^>]*){0,1}>', self.declare_xhtml_namespace, ans[:1000]) + ans[1000:]
-        ans = re.compile(r'<head>', re.IGNORECASE).sub('<head>\n\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
+        self.root.set('xmlns', 'http://www.w3.org/1999/xhtml')
+        self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink')
+        for svg in self.root.xpath('//svg'):
+            svg.set('xmlns', 'http://www.w3.org/2000/svg')
        
+        ans = tostring(self.root, pretty_print=self.opts.pretty_print)
+        ans = re.compile(r'<head>', re.IGNORECASE).sub('<head>\n\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
        with open(self.save_path(), 'wb') as f:
            f.write(ans)
            return f.name
--- a/src/calibre/ebooks/oeb/transforms/rasterize.py
+++ b/src/calibre/ebooks/oeb/transforms/rasterize.py
@@ -46,9 +46,10 @@ class SVGRasterizer(object):
        data = QByteArray(xml2str(elem))
        svg = QSvgRenderer(data)
        size = svg.defaultSize()
+        view_box = elem.get('viewBox', elem.get('viewbox', None))
        if size.width() == 100 and size.height() == 100 \
-           and 'viewBox' in elem.attrib:
-            box = [float(x) for x in elem.attrib['viewBox'].split()]
+           and view_box is not None:
+            box = [float(x) for x in view_box.split()]
            size.setWidth(box[2] - box[0])
            size.setHeight(box[3] - box[1])
        if width or height: