From 61e22407f89b83ff0d69b9ae84223daa07386771 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 26 Jan 2009 00:40:17 -0800 Subject: [PATCH] Fix #1695 (Cover Pages in Penguin samples) --- src/calibre/ebooks/epub/from_html.py | 2 ++ src/calibre/ebooks/html.py | 29 +++++++++++-------- .../ebooks/oeb/transforms/rasterize.py | 5 ++-- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py index c358471f09..458fca152c 100644 --- a/src/calibre/ebooks/epub/from_html.py +++ b/src/calibre/ebooks/epub/from_html.py @@ -193,6 +193,8 @@ class HTMLProcessor(Processor, Rationalizer): for tag in self.root.xpath('//script'): if not tag.text and not tag.get('src', False): tag.getparent().remove(tag) + + def save(self): for meta in list(self.root.xpath('//meta')): diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index f6659f9f51..32601320d4 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -417,39 +417,44 @@ class Parser(PreProcessor, LoggingInterface): self.level = self.htmlfile.level for f in self.htmlfiles: name = os.path.basename(f.path) + name = os.path.splitext(name)[0] + '.xhtml' if name in self.htmlfile_map.values(): name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1] save_counter += 1 self.htmlfile_map[f.path] = name self.parse_html() + # Handle <image> tags inside embedded <svg> + # At least one source of EPUB files (Penguin) uses xlink:href + # without declaring the xlink namespace + for image in self.root.xpath('//image'): + for attr in image.attrib.keys(): + if attr.endswith(':href'): + nhref = self.rewrite_links(image.get(attr)) + image.set(attr, nhref) + self.root.rewrite_links(self.rewrite_links, resolve_base_href=False) for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates if self.root.get(bad, None) is not None: self.root.attrib.pop(bad) + + def
save_path(self): return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]) - def declare_xhtml_namespace(self, match): - if not match.group('raw'): - return '<html xmlns="http://www.w3.org/1999/xhtml">' - raw = match.group('raw') - m = re.search(r'(?i)xmlns\s*=\s*[\'"](?P<uri>[^"\']*)[\'"]', raw) - if not m: - return '<html xmlns="http://www.w3.org/1999/xhtml" %s>'%raw - else: - return match.group().replace(m.group('uri'), "http://www.w3.org/1999/xhtml") - def save(self): ''' Save processed HTML into the content directory. Should be called after all HTML processing is finished. ''' + self.root.set('xmlns', 'http://www.w3.org/1999/xhtml') + self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink') + for svg in self.root.xpath('//svg'): + svg.set('xmlns', 'http://www.w3.org/2000/svg') + ans = tostring(self.root, pretty_print=self.opts.pretty_print) - ans = re.sub(r'(?i)<\s*html(?P<raw>\s+[^>]*){0,1}>', self.declare_xhtml_namespace, ans[:1000]) + ans[1000:] ans = re.compile(r'', re.IGNORECASE).sub('\n\t\n', ans[:1000])+ans[1000:] - with open(self.save_path(), 'wb') as f: f.write(ans) return f.name diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 69f1d0d133..97d73d3dcb 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -46,9 +46,10 @@ class SVGRasterizer(object): data = QByteArray(xml2str(elem)) svg = QSvgRenderer(data) size = svg.defaultSize() + view_box = elem.get('viewBox', elem.get('viewbox', None)) if size.width() == 100 and size.height() == 100 \ - and 'viewBox' in elem.attrib: - box = [float(x) for x in elem.attrib['viewBox'].split()] + and view_box is not None: + box = [float(x) for x in view_box.split()] size.setWidth(box[2] - box[0]) size.setHeight(box[3] - box[1]) if width or height: