Fix handling of reflowable covers in EPUB input. Also try to extract SVG-embedded raster covers. Misc. minor fixes.

This commit is contained in:
Kovid Goyal 2009-07-22 13:48:05 -06:00
parent a20d9fb169
commit d0e1fa2d90
8 changed files with 63 additions and 77 deletions

View File

@ -361,6 +361,8 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252'):
return '&'+ent+';' return '&'+ent+';'
if ent == 'apos': if ent == 'apos':
return "'" return "'"
if ent == 'hellips':
ent = 'hellip'
if ent.startswith(u'#x'): if ent.startswith(u'#x'):
num = int(ent[2:], 16) num = int(ent[2:], 16)
if encoding is None or num > 255: if encoding is None or num > 255:

View File

@ -57,6 +57,35 @@ class HTMLRenderer(object):
self.loop.exit(0) self.loop.exit(0)
def extract_cover_from_embedded_svg(html, base, log):
    """Return the raster image wrapped by an SVG-only cover page, if any.

    The markup is parsed as XML. The extraction only succeeds when the
    document contains exactly one ``svg:svg`` element whose single child is
    an ``svg:image`` with a readable ``xlink:href`` target.

    :param html: Raw markup of the cover page, parsed with
                 ``etree.fromstring`` (so it must be well-formed XML).
    :param base: Directory against which the image ``href`` is resolved.
    :param log: Logger supplied by the caller; currently unused here.
    :return: The raw bytes of the referenced image file, or ``None`` when
             the page does not match the pattern or the file is unreadable.
    :raises: Whatever ``etree.fromstring`` raises on unparseable markup.
    """
    from lxml import etree
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    root = etree.fromstring(html)
    svg = XPath('//svg:svg')(root)
    if len(svg) != 1 or len(svg[0]) != 1 or svg[0][0].tag != SVG('image'):
        return None
    href = svg[0][0].get(XLINK('href'), None)
    if not href:
        # An <image> without xlink:href has nothing to extract; checking
        # before building the path avoids calling .split() on None.
        return None
    # hrefs use '/' separators; rebuild the path with the OS separator.
    path = os.path.join(base, *href.split('/'))
    if not os.access(path, os.R_OK):
        return None
    with open(path, 'rb') as f:
        return f.read()
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    """Produce cover image data for an HTML page, preferring embedded SVG.

    If the file mentions the SVG namespace, first try to pull the raster
    image straight out of the SVG wrapper. When that yields nothing (or
    fails), fall back to rendering the page with ``render_html``.

    :param path_to_html: Filesystem path of the HTML cover page.
    :param log: Callable logger used to report a failed SVG extraction.
    :param width: Width passed to ``render_html`` for the fallback render.
    :param height: Height passed to ``render_html`` for the fallback render.
    :return: Image data, or ``None`` if neither approach produced any.
    """
    from calibre.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        raw = f.read()
    data = None
    if SVG_NS in raw:
        try:
            data = extract_cover_from_embedded_svg(raw,
                    os.path.dirname(path_to_html), log)
        except Exception:
            # Extraction is best effort: report the failure and fall through
            # to the renderer. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            log('Failed to extract cover from embedded SVG in',
                    path_to_html)
    if data is None:
        renderer = render_html(path_to_html, width, height)
        # The renderer exposes its output as ``data``; tolerate a renderer
        # object without that attribute.
        data = getattr(renderer, 'data', None)
    return data
def render_html(path_to_html, width=590, height=750): def render_html(path_to_html, width=590, height=750):
from PyQt4.QtWebKit import QWebPage from PyQt4.QtWebKit import QWebPage
from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize from PyQt4.Qt import QEventLoop, QPalette, Qt, SIGNAL, QUrl, QSize

View File

@ -54,7 +54,7 @@ class EPUBInput(InputFormatPlugin):
return False return False
@classmethod @classmethod
def rationalize_cover(self, opf): def rationalize_cover(self, opf, log):
guide_cover, guide_elem = None, None guide_cover, guide_elem = None, None
for guide_elem in opf.iterguide(): for guide_elem in opf.iterguide():
if guide_elem.get('type', '').lower() == 'cover': if guide_elem.get('type', '').lower() == 'cover':
@ -65,28 +65,37 @@ class EPUBInput(InputFormatPlugin):
spine = list(opf.iterspine()) spine = list(opf.iterspine())
if not spine: if not spine:
return return
# Check if the cover specified in the guide is also
# the first element in spine
idref = spine[0].get('idref', '') idref = spine[0].get('idref', '')
manifest = list(opf.itermanifest()) manifest = list(opf.itermanifest())
if not manifest: if not manifest:
return return
if manifest[0].get('id', False) != idref: elem = [x for x in manifest if x.get('id', '') == idref]
if not elem or elem[0].get('href', None) != guide_cover:
return return
log('Found HTML cover', guide_cover)
# Remove from spine as covers must be treated
# specially
spine[0].getparent().remove(spine[0]) spine[0].getparent().remove(spine[0])
guide_elem.set('href', 'calibre_raster_cover.jpg') guide_elem.set('href', 'calibre_raster_cover.jpg')
from calibre.ebooks.oeb.base import OPF
t = etree.SubElement(elem[0].getparent(), OPF('item'),
href=guide_elem.get('href'), id='calibre_raster_cover')
t.set('media-type', 'image/jpeg')
for elem in list(opf.iterguide()): for elem in list(opf.iterguide()):
if elem.get('type', '').lower() == 'titlepage': if elem.get('type', '').lower() == 'titlepage':
elem.getparent().remove(elem) elem.getparent().remove(elem)
from calibre.ebooks.oeb.base import OPF
t = etree.SubElement(guide_elem.getparent(), OPF('reference')) t = etree.SubElement(guide_elem.getparent(), OPF('reference'))
t.set('type', 'titlepage') t.set('type', 'titlepage')
t.set('href', guide_cover) t.set('href', guide_cover)
t.set('title', 'Title Page') t.set('title', 'Title Page')
from calibre.ebooks import render_html from calibre.ebooks import render_html_svg_workaround
renderer = render_html(guide_cover) renderer = render_html_svg_workaround(guide_cover, log)
if renderer is not None: if renderer is not None:
open('calibre_raster_cover.jpg', 'wb').write( open('calibre_raster_cover.jpg', 'wb').write(
renderer.data) renderer)
def convert(self, stream, options, file_ext, log, accelerators): def convert(self, stream, options, file_ext, log, accelerators):
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
@ -121,7 +130,7 @@ class EPUBInput(InputFormatPlugin):
for elem in opf.iterguide(): for elem in opf.iterguide():
elem.set('href', delta+elem.get('href')) elem.set('href', delta+elem.get('href'))
self.rationalize_cover(opf) self.rationalize_cover(opf, log)
with open('content.opf', 'wb') as nopf: with open('content.opf', 'wb') as nopf:
nopf.write(opf.render()) nopf.write(opf.render())

View File

@ -5,14 +5,10 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from epub files''' '''Read meta information from epub files'''
import os, time import os
from cStringIO import StringIO from cStringIO import StringIO
from contextlib import closing from contextlib import closing
from PyQt4.Qt import QUrl, QEventLoop, QSize, QByteArray, QBuffer, \
SIGNAL, QPainter, QImage, QObject, QApplication, Qt, QPalette
from PyQt4.QtWebKit import QWebPage
from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace from calibre.utils.zipfile import ZipFile, BadZipfile, safe_replace
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
@ -102,64 +98,9 @@ class OCFDirReader(OCFReader):
def open(self, path, *args, **kwargs): def open(self, path, *args, **kwargs):
return open(os.path.join(self.root, path), *args, **kwargs) return open(os.path.join(self.root, path), *args, **kwargs)
class CoverRenderer(QObject):
    '''
    Render a local HTML file in an off-screen QWebPage and expose the
    result as JPEG data through the ``image_data`` property.

    NOTE(review): indentation was not available for this block; the nesting
    below is reconstructed from the syntax. Confirm the two spots marked
    with NOTE(review) against the original file.
    '''
    # Viewport size, in pixels, handed to QWebPage.setViewportSize().
    WIDTH = 600
    HEIGHT = 800

    def __init__(self, path):
        '''
        Start loading ``path`` (a filesystem path) into a QWebPage.

        Rendering happens later, in :meth:`render_html`, once the page
        emits ``loadFinished(bool)``.
        '''
        # Create a QApplication if none exists yet.
        if QApplication.instance() is None:
            QApplication([])
        QObject.__init__(self)
        self.loop = QEventLoop()
        self.page = QWebPage()
        # Set the page's background brush to white.
        pal = self.page.palette()
        pal.setBrush(QPalette.Background, Qt.white)
        self.page.setPalette(pal)
        self.page.setViewportSize(QSize(self.WIDTH, self.HEIGHT))
        # Turn both scroll bars off.
        self.page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff)
        self.page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff)
        QObject.connect(self.page, SIGNAL('loadFinished(bool)'), self.render_html)
        # Filled in by render_html(); stays None if loading fails.
        self._image_data = None
        self.rendered = False
        url = QUrl.fromLocalFile(os.path.normpath(path))
        self.page.mainFrame().load(url)

    def render_html(self, ok):
        '''
        Slot for ``loadFinished(bool)``: paint the page into a JPEG.

        When ``ok`` is false nothing is painted and ``_image_data`` is left
        as it was. In every case the event loop is told to exit.
        '''
        try:
            if not ok:
                self.rendered = True
                return
            image = QImage(self.page.viewportSize(), QImage.Format_ARGB32)
            # 96 dots per inch expressed in dots per meter
            # (100/2.54 inches per meter).
            image.setDotsPerMeterX(96*(100/2.54))
            image.setDotsPerMeterY(96*(100/2.54))
            painter = QPainter(image)
            self.page.mainFrame().render(painter)
            painter.end()
            # Encode the image as JPEG into an in-memory buffer.
            ba = QByteArray()
            buf = QBuffer(ba)
            buf.open(QBuffer.WriteOnly)
            image.save(buf, 'JPEG')
            self._image_data = str(ba.data())
        finally:
            self.loop.exit(0)
        # NOTE(review): placed after the try/finally here; it may instead
        # belong inside the finally clause -- confirm.
        self.rendered = True

    def image_data():
        # Property factory: builds the read-only ``image_data`` property and
        # is immediately replaced by it on the last line of the class.
        def fget(self):
            if not self.rendered:
                # Run the event loop until render_html() calls loop.exit().
                self.loop.exec_()
                # NOTE(review): the polling below is assumed to be inside
                # this ``if`` -- confirm. It waits at most 50 * 0.1 s for
                # ``rendered`` to be set.
                count = 0
                while count < 50 and not self.rendered:
                    time.sleep(0.1)
                    count += 1
            return self._image_data
        return property(fget=fget)
    image_data = image_data()
def get_cover(opf, opf_path, stream): def get_cover(opf, opf_path, stream):
from calibre.gui2 import is_ok_to_use_qt from calibre.ebooks import render_html_svg_workaround
if not is_ok_to_use_qt(): return None from calibre.utils.logging import default_log
spine = list(opf.spine_items()) spine = list(opf.spine_items())
if not spine: if not spine:
return return
@ -172,8 +113,7 @@ def get_cover(opf, opf_path, stream):
cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage) cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage)
if not os.path.exists(cpage): if not os.path.exists(cpage):
return return
cr = CoverRenderer(cpage) return render_html_svg_workaround(cpage, default_log)
return cr.image_data
def get_metadata(stream, extract_cover=True): def get_metadata(stream, extract_cover=True):
""" Return metadata as a :class:`MetaInformation` object """ """ Return metadata as a :class:`MetaInformation` object """

View File

@ -1556,7 +1556,8 @@ class MobiWriter(object):
else: else:
raise NotImplementedError("missing date or timestamp needed for mobi_periodical") raise NotImplementedError("missing date or timestamp needed for mobi_periodical")
if oeb.metadata.cover: if oeb.metadata.cover and \
unicode(oeb.metadata.cover[0]) in oeb.manifest.ids:
id = unicode(oeb.metadata.cover[0]) id = unicode(oeb.metadata.cover[0])
item = oeb.manifest.ids[id] item = oeb.manifest.ids[id]
href = item.href href = item.href

View File

@ -27,7 +27,6 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
OEBError, OEBBook, DirContainer OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.oeb.writer import OEBWriter
from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
from calibre.ebooks.metadata.epub import CoverRenderer
from calibre.startup import get_lang from calibre.startup import get_lang
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__ from calibre.constants import __appname__, __version__
@ -346,6 +345,8 @@ class OEBReader(object):
if descriptionElement: if descriptionElement:
description = etree.tostring(descriptionElement[0], description = etree.tostring(descriptionElement[0],
method='text', encoding=unicode).strip() method='text', encoding=unicode).strip()
if not description:
description = None
else : else :
description = None description = None
@ -525,12 +526,14 @@ class OEBReader(object):
return return
def _cover_from_html(self, hcover): def _cover_from_html(self, hcover):
from calibre.ebooks import render_html_svg_workaround
with TemporaryDirectory('_html_cover') as tdir: with TemporaryDirectory('_html_cover') as tdir:
writer = OEBWriter() writer = OEBWriter()
writer(self.oeb, tdir) writer(self.oeb, tdir)
path = os.path.join(tdir, urlunquote(hcover.href)) path = os.path.join(tdir, urlunquote(hcover.href))
renderer = CoverRenderer(path) data = render_html_svg_workaround(path, self.logger)
data = renderer.image_data if not data:
data = ''
id, href = self.oeb.manifest.generate('cover', 'cover.jpeg') id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
return item return item

View File

@ -102,3 +102,5 @@ class Log(object):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
self.prints(INFO, *args, **kwargs) self.prints(INFO, *args, **kwargs)
default_log = Log()

View File

@ -17,7 +17,7 @@ class Publico(BasicNewsRecipe):
max_articles_per_feed = 30 max_articles_per_feed = 30
encoding='utf-8' encoding='utf-8'
no_stylesheets = True no_stylesheets = True
language = _('Portuguese') language = _('Portugese')
preprocess_regexps = [(re.compile(u"\uFFFD", re.DOTALL|re.IGNORECASE), lambda match: ''),] preprocess_regexps = [(re.compile(u"\uFFFD", re.DOTALL|re.IGNORECASE), lambda match: ''),]
feeds = [ feeds = [