From f728c12835dacb7419d93e3a74076fd53492542b Mon Sep 17 00:00:00 2001 From: Oliver Graf Date: Fri, 27 Jul 2012 15:01:47 +0200 Subject: [PATCH] Added extra metadata parsing to ODT module. Added manual section about ODT features added and general conversion tips. --- manual/conversion.rst | 28 ++++++++++ src/calibre/ebooks/metadata/odt.py | 85 ++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 3 deletions(-) diff --git a/manual/conversion.rst b/manual/conversion.rst index 5eaca5a469..a4ecd902cc 100644 --- a/manual/conversion.rst +++ b/manual/conversion.rst @@ -710,3 +710,31 @@ EPUB from the ZIP file are:: Note that because this file explores the potential of EPUB, most of the advanced formatting is not going to work on readers less capable than |app|'s built-in EPUB viewer. + +Convert ODT documents +~~~~~~~~~~~~~~~~~~~~~ + +|app| can directly convert ODT (OpenDocument Text) files. You should use styles to format your document and minimize the use of direct formatting. +When inserting images into your document you need to anchor them to the paragraph; images anchored to a page will all end up in the front of the conversion. + +To enable automatic detection of chapters, you need to mark them with the built-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6' ('Heading 1' equates to the HTML tag

<h1>, 'Heading 2' to <h2>
etc). When you convert in |app| you can enter which style you used into the 'Detect chapters at' box. Example: + + * If you mark Chapters with style 'Heading 2', you have to set the 'Detect chapters at' box to ``//h:h2`` + * For a nested TOC with Sections marked with 'Heading 2' and the Chapters marked with 'Heading 3' you need to enter ``//h:h2|//h:h3``. On the Convert - TOC page set the 'Level 1 TOC' box to ``//h:h2`` and the 'Level 2 TOC' box to ``//h:h3``. + +Well-known document properties (Title, Keywords, Description, Creator) are recognized and |app| will use the first image (not too small, and with a good aspect ratio) as the cover image. + +There is also an advanced property conversion mode, which is activated by setting the custom property ``opf.metadata`` ('Yes or No' type) to Yes in your ODT document (File->Properties->Custom Properties). +If this property is detected by |app|, the following custom properties are recognized (``opf.authors`` overrides document creator):: + + opf.titlesort + opf.authors + opf.authorsort + opf.publisher + opf.pubdate + opf.isbn + opf.language + +In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method is used. +To prevent this you can set the custom property ``opf.nocover`` ('Yes or No' type) to Yes. + diff --git a/src/calibre/ebooks/metadata/odt.py b/src/calibre/ebooks/metadata/odt.py index bf30dfd5f7..d795b997e2 100644 --- a/src/calibre/ebooks/metadata/odt.py +++ b/src/calibre/ebooks/metadata/odt.py @@ -1,5 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +# # Copyright (C) 2006 Søren Roug, European Environment Agency # # This is free software. 
You may redistribute it under the terms @@ -17,12 +19,19 @@ # # Contributor(s): # +from __future__ import division + import zipfile, re import xml.sax.saxutils from cStringIO import StringIO from odf.namespaces import OFFICENS, DCNS, METANS -from calibre.ebooks.metadata import MetaInformation, string_to_authors +from odf.opendocument import load as odLoad +from odf.draw import Image as odImage, Frame as odFrame + +from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn +from calibre.utils.magick.draw import identify_data +from calibre.utils.date import parse_date whitespace = re.compile(r'\s+') @@ -125,6 +134,10 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator): else: texttag = self._tag self.seenfields[texttag] = self.data() + # OpenOffice has the habit to capitalize custom properties, so we add a + # lowercase version for easy access + if texttag[:4].lower() == u'opf.': + self.seenfields[texttag.lower()] = self.data() if field in self.deletefields: self.output.dowrite = True @@ -141,7 +154,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator): def data(self): return normalize(''.join(self._data)) -def get_metadata(stream): +def get_metadata(stream, extract_cover=True): zin = zipfile.ZipFile(stream, 'r') odfs = odfmetaparser() parser = xml.sax.make_parser() @@ -162,7 +175,73 @@ def get_metadata(stream): if data.has_key('language'): mi.language = data['language'] if data.get('keywords', ''): - mi.tags = data['keywords'].split(',') + mi.tags = map(lambda x: x.strip(), data['keywords'].split(',')) + opfmeta = False # we need this later for the cover + opfnocover = False + if data.get('opf.metadata','') == 'true': + # custom metadata contains OPF information + opfmeta = True + if data.get('opf.titlesort', ''): + mi.title_sort = data['opf.titlesort'] + if data.get('opf.authors', ''): + mi.authors = string_to_authors(data['opf.authors']) + if data.get('opf.authorsort', ''): + mi.author_sort = data['opf.authorsort'] + if 
data.get('opf.isbn', ''): + isbn = check_isbn(data['opf.isbn']) + if isbn is not None: + mi.isbn = isbn + if data.get('opf.publisher', ''): + mi.publisher = data['opf.publisher'] + if data.get('opf.pubdate', ''): + mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True) + if data.get('opf.language', ''): + mi.languages = [ data['opf.language'] ] + opfnocover = data.get('opf.nocover', 'false') == 'true' + # search for an draw:image in a draw:frame with the name 'opf.cover' + # if opf.metadata prop is false, just use the first image that + # has a proper size (borrowed from docx) + otext = odLoad(stream) + cover_href = None + cover_data = None + # check that it's really a ODT + if otext.mimetype == u'application/vnd.oasis.opendocument.text': + for elem in otext.text.getElementsByType(odFrame): + img = elem.getElementsByType(odImage) + if len(img) > 0: # there should be only one + i_href = img[0].getAttribute('href') + try: + raw = zin.read(i_href) + except KeyError: + continue + try: + width, height, fmt = identify_data(raw) + except: + continue + else: + continue + if opfmeta and elem.getAttribute('name').lower() == u'opf.cover': + cover_href = i_href + cover_data = (fmt, raw) + break + if cover_href is None and 0.8 <= height/width <= 1.8 and height*width >= 12000: + cover_href = i_href + cover_data = (fmt, raw) + if not opfmeta: + break + + if not opfnocover and cover_href is not None: + mi.cover = cover_href + if extract_cover: + if not cover_data: + raw = zin.read(cover_href) + try: + width, height, fmt = identify_data(raw) + except: + pass + else: + cover_data = (fmt, raw) + mi.cover_data = cover_data return mi