Fix #1695 (Cover Pages in Penguin samples)

This commit is contained in:
Kovid Goyal 2009-01-26 00:40:17 -08:00
parent 4ce35e4fd7
commit 61e22407f8
3 changed files with 22 additions and 14 deletions

View File

@ -194,6 +194,8 @@ class HTMLProcessor(Processor, Rationalizer):
if not tag.text and not tag.get('src', False):
tag.getparent().remove(tag)
def save(self):
for meta in list(self.root.xpath('//meta')):
meta.getparent().remove(meta)

View File

@ -417,39 +417,44 @@ class Parser(PreProcessor, LoggingInterface):
self.level = self.htmlfile.level
for f in self.htmlfiles:
name = os.path.basename(f.path)
name = os.path.splitext(name)[0] + '.xhtml'
if name in self.htmlfile_map.values():
name = os.path.splitext(name)[0] + '_cr_%d'%save_counter + os.path.splitext(name)[1]
save_counter += 1
self.htmlfile_map[f.path] = name
self.parse_html()
# Handle <image> tags inside embedded <svg>
# At least one source of EPUB files (Penguin) uses xlink:href
# without declaring the xlink namespace
for image in self.root.xpath('//image'):
for attr in image.attrib.keys():
if attr.endswith(':href'):
nhref = self.rewrite_links(image.get(attr))
image.set(attr, nhref)
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
if self.root.get(bad, None) is not None:
self.root.attrib.pop(bad)
def save_path(self):
    '''
    Return the path inside ``self.tdir`` under which the current HTML file
    is saved, using the output name recorded for it in
    ``self.htmlfile_map``.
    '''
    output_name = self.htmlfile_map[self.htmlfile.path]
    return os.path.join(self.tdir, output_name)
def declare_xhtml_namespace(self, match):
    '''
    Build the replacement text for an opening ``<html ...>`` tag so that it
    declares the XHTML namespace. Written to be used as a ``re.sub``
    callback.

    :param match: Match object for the opening ``<html>`` tag. It must
                  define a named group ``raw`` holding the attribute text
                  of the tag (``None``/empty when there are no attributes).
    :return: The rewritten opening tag.
    '''
    xhtml_ns = 'http://www.w3.org/1999/xhtml'
    raw = match.group('raw')
    if not raw:
        # No attributes at all: emit a minimal tag.
        return '<html xmlns="%s">' % xhtml_ns
    m = re.search(r'(?i)xmlns\s*=\s*[\'"](?P<uri>[^"\']*)[\'"]', raw)
    if not m:
        # Attributes present but no default namespace: prepend one.
        return '<html xmlns="%s" %s>' % (xhtml_ns, raw)
    # Replace only the span occupied by the existing namespace URI.
    # A plain str.replace() on the URI text would also rewrite any other
    # attribute containing the same text, and would mangle the whole tag
    # when the URI is the empty string.
    tag = match.group()
    offset = match.start('raw') - match.start()
    return tag[:offset + m.start('uri')] + xhtml_ns + tag[offset + m.end('uri'):]
def save(self):
    '''
    Save processed HTML into the content directory.
    Should be called after all HTML processing is finished.

    The XHTML and xlink namespaces are declared on the root element, and
    the SVG namespace on every ``<svg>`` element, before serialization.
    A UTF-8 Content-Type ``<meta>`` tag is then inserted after the first
    ``<head>`` found within the first 1000 characters of the output.

    :return: The name of the file that was written.
    '''
    # Declare the namespaces on the tree itself before serializing.
    self.root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink')
    for svg in self.root.xpath('//svg'):
        svg.set('xmlns', 'http://www.w3.org/2000/svg')
    # NOTE(review): an earlier serialization pass whose result was
    # overwritten before use has been dropped; this assumes tostring()
    # has no side effects -- confirm.
    ans = tostring(self.root, pretty_print=self.opts.pretty_print)
    # Only the first 1000 characters are searched for <head>.
    ans = re.compile(r'<head>', re.IGNORECASE).sub('<head>\n\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
    with open(self.save_path(), 'wb') as f:
        f.write(ans)
    return f.name

View File

@ -46,9 +46,10 @@ class SVGRasterizer(object):
data = QByteArray(xml2str(elem))
svg = QSvgRenderer(data)
size = svg.defaultSize()
view_box = elem.get('viewBox', elem.get('viewbox', None))
if size.width() == 100 and size.height() == 100 \
and 'viewBox' in elem.attrib:
box = [float(x) for x in elem.attrib['viewBox'].split()]
and view_box is not None:
box = [float(x) for x in view_box.split()]
size.setWidth(box[2] - box[0])
size.setHeight(box[3] - box[1])
if width or height: