diff --git a/manual/conversion.rst b/manual/conversion.rst index a4ecd902cc..feae2a4273 100644 --- a/manual/conversion.rst +++ b/manual/conversion.rst @@ -734,7 +734,11 @@ If this property is detected by |app|, the following custom properties are recog opf.pubdate opf.isbn opf.language + opf.series + opf.seriesindex In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method is used. -To prevent this you can set the custom property ``opf.nocover`` ('Yes or No' type) to Yes. +Because cover detection can result in duplicate covers in certain output formats, the conversion removes the paragraph containing the cover from the document, but only if the cover is that paragraph's sole content. This removal works only with the named picture (``opf.cover``), not with a cover found by the 'smart' method. + +To disable cover detection, set the custom property ``opf.nocover`` ('Yes or No' type) to Yes in advanced mode. diff --git a/src/calibre/ebooks/metadata/odt.py b/src/calibre/ebooks/metadata/odt.py index a4371a4506..b919885bfd 100644 --- a/src/calibre/ebooks/metadata/odt.py +++ b/src/calibre/ebooks/metadata/odt.py @@ -196,6 +196,13 @@ def get_metadata(stream, extract_cover=True): mi.publisher = data['opf.publisher'] if data.get('opf.pubdate', ''): mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True) + if data.get('opf.series', ''): + mi.series = data['opf.series'] + if data.get('opf.seriesindex', ''): + try: + mi.series_index = float(data['opf.seriesindex']) + except ValueError: + mi.series_index = 1.0 if data.get('opf.language', ''): cl = canonicalize_lang(data['opf.language']) if cl: @@ -216,34 +223,35 @@ def read_cover(stream, zin, mi, opfmeta, extract_cover): otext = odLoad(stream) cover_href = None cover_data = None - # check that it's really a ODT - if otext.mimetype == u'application/vnd.oasis.opendocument.text': - for elem in otext.text.getElementsByType(odFrame): - img = elem.getElementsByType(odImage) - if 
len(img) > 0: # there should be only one - i_href = img[0].getAttribute('href') - try: - raw = zin.read(i_href) - except KeyError: - continue - try: - width, height, fmt = identify_data(raw) - except: - continue - else: + cover_frame = None + for frm in otext.topnode.getElementsByType(odFrame): + img = frm.getElementsByType(odImage) + if len(img) > 0: # there should be only one + i_href = img[0].getAttribute('href') + try: + raw = zin.read(i_href) + except KeyError: continue - if opfmeta and elem.getAttribute('name').lower() == u'opf.cover': - cover_href = i_href - cover_data = (fmt, raw) + try: + width, height, fmt = identify_data(raw) + except: + continue + else: + continue + if opfmeta and frm.getAttribute('name').lower() == u'opf.cover': + cover_href = i_href + cover_data = (fmt, raw) + cover_frame = frm.getAttribute('name') # could have upper case + break + if cover_href is None and 0.8 <= height/width <= 1.8 and height*width >= 12000: + cover_href = i_href + cover_data = (fmt, raw) + if not opfmeta: break - if cover_href is None and 0.8 <= height/width <= 1.8 and height*width >= 12000: - cover_href = i_href - cover_data = (fmt, raw) - if not opfmeta: - break if cover_href is not None: mi.cover = cover_href + mi.odf_cover_frame = cover_frame if extract_cover: if not cover_data: raw = zin.read(cover_href) diff --git a/src/calibre/ebooks/odt/input.py b/src/calibre/ebooks/odt/input.py index 1a70335a13..f0d2335a30 100644 --- a/src/calibre/ebooks/odt/input.py +++ b/src/calibre/ebooks/odt/input.py @@ -10,6 +10,9 @@ import os from lxml import etree from odf.odf2xhtml import ODF2XHTML +from odf.opendocument import load as odLoad +from odf.draw import Frame as odFrame, Image as odImage +from odf.namespaces import TEXTNS as odTEXTNS from calibre import CurrentDir, walk @@ -138,22 +141,84 @@ class Extract(ODF2XHTML): r.selectorText = '.'+replace_name return sheet.cssText, sel_map + def search_page_img(self, mi, log): + for frm in 
self.document.topnode.getElementsByType(odFrame): + try: + if frm.getAttrNS(odTEXTNS,u'anchor-type') == 'page': + log.warn('Document has Pictures anchored to Page, will all end up before first page!') + break + except ValueError: + pass + + def filter_cover(self, mi, log): + # filter the Element tree (remove the detected cover) + if mi.cover and mi.odf_cover_frame: + for frm in self.document.topnode.getElementsByType(odFrame): + # search the right frame + if frm.getAttribute('name') == mi.odf_cover_frame: + img = frm.getElementsByType(odImage) + # only one draw:image allowed in the draw:frame + if len(img) == 1 and img[0].getAttribute('href') == mi.cover: + # ok, this is the right frame with the right image + # check if there are more childs + if len(frm.childNodes) != 1: + break + # check if the parent paragraph more childs + para = frm.parentNode + if para.tagName != 'text:p' or len(para.childNodes) != 1: + break + # now it should be safe to remove the text:p + parent = para.parentNode + parent.removeChild(para) + log("Removed cover image paragraph from document...") + break + + def filter_load(self, odffile, mi, log): + """ This is an adaption from ODF2XHTML. It adds a step between + load and parse of the document where the Element tree can be + modified. 
+ """ + # first load the odf structure + self.lines = [] + self._wfunc = self._wlines + if isinstance(odffile, basestring) \ + or hasattr(odffile, 'read'): # Added by Kovid + self.document = odLoad(odffile) + else: + self.document = odffile + # filter stuff + self.search_page_img(mi, log) + try: + self.filter_cover(mi, log) + except: + pass + # parse the modified tree and generate xhtml + self._walknode(self.document.topnode) + def __call__(self, stream, odir, log): from calibre.utils.zipfile import ZipFile - from calibre.ebooks.metadata.meta import get_metadata + from calibre.ebooks.metadata.odt import get_metadata from calibre.ebooks.metadata.opf2 import OPFCreator - from calibre.customize.ui import quick_metadata if not os.path.exists(odir): os.makedirs(odir) with CurrentDir(odir): log('Extracting ODT file...') - html = self.odf2xhtml(stream) + stream.seek(0) + mi = get_metadata(stream, 'odt') + if not mi.title: + mi.title = _('Unknown') + if not mi.authors: + mi.authors = [_('Unknown')] + self.filter_load(stream, mi, log) + html = self.xhtml() # A blanket img specification like this causes problems # with EPUB output as the containing element often has # an absolute height and width set that is larger than # the available screen real estate html = html.replace('img { width: 100%; height: 100%; }', '') + # odf2xhtml creates empty title tag + html = html.replace('