Fix handling of reflowable covers in EPUB input. Also try to extract SVG-embedded raster covers. Misc. minor fixes

This commit is contained in:
Kovid Goyal 2009-07-22 13:48:05 -06:00
parent a20d9fb169
commit d0e1fa2d90
8 changed files with 63 additions and 77 deletions

View File

@ -361,6 +361,8 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
return '&'+ent+';'
if ent == 'apos':
return "'"
if ent == 'hellips':
ent = 'hellip'
if ent.startswith(u'#x'):
num = int(ent[2:], 16)
if encoding is None or num > 255:

View File

@ -57,6 +57,35 @@ class HTMLRenderer(object):
self.loop.exit(0)
def extract_cover_from_embedded_svg(html, base, log):
    """Return the bytes of a raster image wrapped in a lone SVG element.

    The markup is parsed with ``lxml.etree.fromstring``. The image is only
    extracted when the document contains exactly one ``svg:svg`` element
    whose single child is an ``svg:image``.

    :param html: Markup of the cover page, as accepted by
        ``etree.fromstring``.
    :param base: Directory against which the image's ``xlink:href`` is
        resolved.
    :param log: Logger supplied by the caller (not used here).
    :return: The raw contents of the referenced file, or ``None`` when the
        markup does not match the pattern above, the image has no
        ``xlink:href``, or the referenced file is not readable.
    """
    from lxml import etree
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    root = etree.fromstring(html)
    svg = XPath('//svg:svg')(root)
    if len(svg) != 1 or len(svg[0]) != 1:
        return None
    image = svg[0][0]
    if image.tag != SVG('image'):
        return None
    href = image.get(XLINK('href'), None)
    if not href:
        # Check before splitting: a missing attribute yields None, and
        # None.split() would raise instead of reporting "no cover".
        return None
    # hrefs use '/' as separator; rebuild the path with the OS separator.
    path = os.path.join(base, *href.split('/'))
    if not os.access(path, os.R_OK):
        return None
    with open(path, 'rb') as f:
        return f.read()
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    """Return cover image data for an HTML cover page.

    If the page mentions the SVG namespace, first try to pull the raster
    image straight out of the embedded SVG via
    ``extract_cover_from_embedded_svg``. When that yields nothing (or
    fails), fall back to rendering the page with ``render_html``.

    :param path_to_html: Path to the HTML file to turn into a cover.
    :param log: Logger, passed through to the SVG extraction helper.
    :param width: Width passed to ``render_html``.
    :param height: Height passed to ``render_html``.
    :return: The image data, or ``None`` if neither approach produced any.
    """
    from calibre.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        raw = f.read()
    data = None
    if SVG_NS in raw:
        try:
            data = extract_cover_from_embedded_svg(raw,
                    os.path.dirname(path_to_html), log)
        except Exception:
            # Extraction is best effort: any parsing or I/O problem simply
            # means we fall back to rendering. KeyboardInterrupt and
            # SystemExit are deliberately allowed to propagate.
            data = None
    if data is None:
        renderer = render_html(path_to_html, width, height)
        # The renderer may be None or lack a ``data`` attribute.
        data = getattr(renderer, 'data', None)
    return data
def render_html(path_to_html, width=590, height=750):
from PyQt4.QtWebKit import QWebPage
from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize

View File

@ -54,7 +54,7 @@ class EPUBInput(InputFormatPlugin):
return False
@classmethod
def rationalize_cover(self, opf):
def rationalize_cover(self, opf, log):
guide_cover, guide_elem = None, None
for guide_elem in opf.iterguide():
if guide_elem.get('type', '').lower() == 'cover':
@ -65,28 +65,37 @@ class EPUBInput(InputFormatPlugin):
spine = list(opf.iterspine())
if not spine:
return
# Check if the cover specified in the guide is also
# the first element in spine
idref = spine[0].get('idref', '')
manifest = list(opf.itermanifest())
if not manifest:
return
if manifest[0].get('id', False) != idref:
elem = [x for x in manifest if x.get('id', '') == idref]
if not elem or elem[0].get('href', None) != guide_cover:
return
log('Found HTML cover', guide_cover)
# Remove from spine as covers must be treated
# specially
spine[0].getparent().remove(spine[0])
guide_elem.set('href', 'calibre_raster_cover.jpg')
from calibre.ebooks.oeb.base import OPF
t = etree.SubElement(elem[0].getparent(), OPF('item'),
href=guide_elem.get('href'), id='calibre_raster_cover')
t.set('media-type', 'image/jpeg')
for elem in list(opf.iterguide()):
if elem.get('type', '').lower() == 'titlepage':
elem.getparent().remove(elem)
from calibre.ebooks.oeb.base import OPF
t = etree.SubElement(guide_elem.getparent(), OPF('reference'))
t.set('type', 'titlepage')
t.set('href', guide_cover)
t.set('title', 'Title Page')
from calibre.ebooks import render_html
renderer = render_html(guide_cover)
from calibre.ebooks import render_html_svg_workaround
renderer = render_html_svg_workaround(guide_cover, log)
if renderer is not None:
open('calibre_raster_cover.jpg', 'wb').write(
renderer.data)
renderer)
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.utils.zipfile import ZipFile
@ -121,7 +130,7 @@ class EPUBInput(InputFormatPlugin):
for elem in opf.iterguide():
elem.set('href', delta+elem.get('href'))
self.rationalize_cover(opf)
self.rationalize_cover(opf, log)
with open('content.opf', 'wb') as nopf:
nopf.write(opf.render())

View File

@ -5,14 +5,10 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from epub files'''
import os, time
import os
from cStringIO import StringIO
from contextlib import closing
from PyQt4.Qt import QUrl, QEventLoop, QSize, QByteArray, QBuffer, \
SIGNAL, QPainter, QImage, QObject, QApplication, Qt, QPalette
from PyQt4.QtWebKit import QWebPage
from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.metadata import MetaInformation
@ -102,64 +98,9 @@ class OCFDirReader(OCFReader):
def open(self, path, *args, **kwargs):
return open(os.path.join(self.root, path), *args, **kwargs)
# Loads the HTML page at a local path into a QWebPage and, once loading has
# finished, paints it into a QImage and stores the JPEG-encoded bytes.
# The result is read through the ``image_data`` property.
# NOTE(review): leading indentation is missing in this view of the class, so
# nesting described below is inferred from the statements themselves.
class CoverRenderer(QObject):
# Viewport size, in pixels, handed to QWebPage.setViewportSize().
WIDTH = 600
HEIGHT = 800
# path: filesystem path of the HTML page to load.
def __init__(self, path):
# A QApplication is created here if none exists yet.
if QApplication.instance() is None:
QApplication([])
QObject.__init__(self)
self.loop = QEventLoop()
self.page = QWebPage()
# White background brush for the page palette.
pal = self.page.palette()
pal.setBrush(QPalette.Background, Qt.white)
self.page.setPalette(pal)
self.page.setViewportSize(QSize(self.WIDTH, self.HEIGHT))
# Scroll bars are switched off in both directions.
self.page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
self.page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
# render_html() is invoked when the page signals loadFinished(bool).
QObject.connect(self.page, SIGNAL('loadFinished(bool)'), self.render_html)
# _image_data stays None until a successful render stores JPEG bytes.
self._image_data = None
self.rendered = False
url = QUrl.fromLocalFile(os.path.normpath(path))
self.page.mainFrame().load(url)
# Slot for loadFinished(bool); ``ok`` is the flag delivered by that signal.
def render_html(self, ok):
try:
# On a failed load, mark as rendered and leave _image_data as None.
if not ok:
self.rendered = True
return
image = QImage(self.page.viewportSize(), QImage.Format_ARGB32)
# 96 dots per inch expressed in dots per metre (100/2.54 inches per metre).
image.setDotsPerMeterX(96*(100/2.54))
image.setDotsPerMeterY(96*(100/2.54))
painter = QPainter(image)
self.page.mainFrame().render(painter)
painter.end()
# Encode the painted image as JPEG into an in-memory buffer.
ba = QByteArray()
buf = QBuffer(ba)
buf.open(QBuffer.WriteOnly)
image.save(buf, 'JPEG')
self._image_data = str(ba.data())
finally:
# Always stop the event loop that the image_data getter may be running.
self.loop.exit(0)
# NOTE(review): cannot tell from this view whether the next line sits inside
# the ``finally`` block or after it - confirm against the original file.
self.rendered = True
# Builds the read-only ``image_data`` property from a nested getter.
def image_data():
def fget(self):
# Run the event loop if rendering has not completed yet.
if not self.rendered:
self.loop.exec_()
# Then poll for up to 50 iterations of 0.1 s while still not rendered.
count = 0
while count < 50 and not self.rendered:
time.sleep(0.1)
count += 1
# May be None if loading failed or rendering never completed.
return self._image_data
return property(fget=fget)
# Replace the factory function with the property object it returns.
image_data = image_data()
def get_cover(opf, opf_path, stream):
from calibre.gui2 import is_ok_to_use_qt
if not is_ok_to_use_qt(): return None
from calibre.ebooks import render_html_svg_workaround
from calibre.utils.logging import default_log
spine = list(opf.spine_items())
if not spine:
return
@ -172,8 +113,7 @@ def get_cover(opf, opf_path, stream):
cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage)
if not os.path.exists(cpage):
return
cr = CoverRenderer(cpage)
return cr.image_data
return render_html_svg_workaround(cpage, default_log)
def get_metadata(stream, extract_cover=True):
""" Return metadata as a :class:`MetaInformation` object """

View File

@ -1556,7 +1556,8 @@ class MobiWriter(object):
else:
raise NotImplementedError("missing date or timestamp needed for mobi_periodical")
if oeb.metadata.cover:
if oeb.metadata.cover and \
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids:
id = unicode(oeb.metadata.cover[0])
item = oeb.manifest.ids[id]
href = item.href

View File

@ -27,7 +27,6 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.metadata.epub import CoverRenderer
from calibre.startup import get_lang
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
@ -346,6 +345,8 @@ class OEBReader(object):
if descriptionElement:
description = etree.tostring(descriptionElement[0],
method='text', encoding=unicode).strip()
if not description:
description = None
else :
description = None
@ -525,12 +526,14 @@ class OEBReader(object):
return
def _cover_from_html(self, hcover):
from calibre.ebooks import render_html_svg_workaround
with TemporaryDirectory('_html_cover') as tdir:
writer = OEBWriter()
writer(self.oeb, tdir)
path = os.path.join(tdir, urlunquote(hcover.href))
renderer = CoverRenderer(path)
data = renderer.image_data
data = render_html_svg_workaround(path, self.logger)
if not data:
data = ''
id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
return item

View File

@ -102,3 +102,5 @@ class Log(object):
def __call__(self, *args, **kwargs):
self.prints(INFO, *args, **kwargs)
default_log = Log()

View File

@ -17,7 +17,7 @@ class Publico(BasicNewsRecipe):
max_articles_per_feed = 30
encoding='utf-8'
no_stylesheets = True
language = _('Portuguese')
language = _('Portugese')
preprocess_regexps = [(re.compile(u"\uFFFD", re.DOTALL|re.IGNORECASE), lambda match: ''),]
feeds = [