From f728c12835dacb7419d93e3a74076fd53492542b Mon Sep 17 00:00:00 2001
From: Oliver Graf <ograf@oli-ver-ena.de>
Date: Fri, 27 Jul 2012 15:01:47 +0200
Subject: [PATCH] Added extra metadata parsing to ODT module.

Added manual section about ODT features added and general conversion tips.
---
 manual/conversion.rst              | 28 ++++++++++
 src/calibre/ebooks/metadata/odt.py | 85 ++++++++++++++++++++++++++++--
 2 files changed, 110 insertions(+), 3 deletions(-)
diff --git a/manual/conversion.rst b/manual/conversion.rst
index 5eaca5a469..a4ecd902cc 100644
--- a/manual/conversion.rst
+++ b/manual/conversion.rst
@@ -710,3 +710,31 @@ EPUB from the ZIP file are::
 
 Note that because this file explores the potential of EPUB, most of the advanced formatting is not going to work on readers less capable than |app|'s built-in EPUB viewer. 
 
+
+Convert ODT documents
+~~~~~~~~~~~~~~~~~~~~~
+
+|app| can directly convert ODT (OpenDocument Text) files. You should use styles to format your document and minimize the use of direct formatting.
+When inserting images into your document you need to anchor them to the paragraph; images anchored to a page will all end up at the front of the converted document.
+
+To enable automatic detection of chapters, you need to mark them with the built-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6' ('Heading 1' equates to the HTML tag <h1>, 'Heading 2' to <h2> etc). When you convert in |app| you can enter which style you used into the 'Detect chapters at' box. Example:
+
+  * If you mark Chapters with style 'Heading 2', you have to set the 'Detect chapters at' box to ``//h:h2``
+  * For a nested TOC with Sections marked with 'Heading 2' and the Chapters marked with 'Heading 3' you need to enter ``//h:h2|//h:h3``. On the Convert - TOC page set the 'Level 1 TOC' box to ``//h:h2`` and the 'Level 2 TOC' box to ``//h:h3``.
+
+Well-known document properties (Title, Keywords, Description, Creator) are recognized and |app| will use the first image (not too small, and with a good aspect ratio) as the cover image.
+
+There is also an advanced property conversion mode, which is activated by setting the custom property ``opf.metadata`` ('Yes or No' type) to Yes in your ODT document (File->Properties->Custom Properties).
+If this property is detected by |app|, the following custom properties are recognized (``opf.authors`` overrides document creator)::
+
+    opf.titlesort
+    opf.authors
+    opf.authorsort
+    opf.publisher
+    opf.pubdate
+    opf.isbn
+    opf.language
+
+In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method is used.
+To prevent this you can set the custom property ``opf.nocover`` ('Yes or No' type) to Yes.
+
diff --git a/src/calibre/ebooks/metadata/odt.py b/src/calibre/ebooks/metadata/odt.py
index bf30dfd5f7..d795b997e2 100644
--- a/src/calibre/ebooks/metadata/odt.py
+++ b/src/calibre/ebooks/metadata/odt.py
@@ -1,5 +1,7 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+#
 # Copyright (C) 2006 Søren Roug, European Environment Agency
 #
 # This is free software.  You may redistribute it under the terms
@@ -17,12 +19,19 @@
 #
 # Contributor(s):
 #
+from __future__ import division
+
 import zipfile, re
 import xml.sax.saxutils
 from cStringIO import StringIO
 
 from odf.namespaces import OFFICENS, DCNS, METANS
-from calibre.ebooks.metadata import MetaInformation, string_to_authors
+from odf.opendocument import load as odLoad
+from odf.draw import Image as odImage, Frame as odFrame
+
+from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
+from calibre.utils.magick.draw import identify_data
+from calibre.utils.date import parse_date
 
 whitespace = re.compile(r'\s+')
 
@@ -125,6 +134,10 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
         else:
             texttag = self._tag
         self.seenfields[texttag] = self.data()
+        # OpenOffice has the habit to capitalize custom properties, so we add a
+        # lowercase version for easy access
+        if texttag[:4].lower() == u'opf.':
+            self.seenfields[texttag.lower()] = self.data()
 
         if field in self.deletefields:
             self.output.dowrite = True
@@ -141,7 +154,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
     def data(self):
         return normalize(''.join(self._data))
 
-def get_metadata(stream):
+def get_metadata(stream, extract_cover=True):
     zin = zipfile.ZipFile(stream, 'r')
     odfs = odfmetaparser()
     parser = xml.sax.make_parser()
@@ -162,7 +175,73 @@ def get_metadata(stream):
     if data.has_key('language'):
         mi.language = data['language']
     if data.get('keywords', ''):
-        mi.tags = data['keywords'].split(',')
+        mi.tags = map(lambda x: x.strip(), data['keywords'].split(','))
+    opfmeta = False # we need this later for the cover
+    opfnocover = False
+    if data.get('opf.metadata','') == 'true':
+        # custom metadata contains OPF information
+        opfmeta = True
+        if data.get('opf.titlesort', ''):
+            mi.title_sort = data['opf.titlesort']
+        if data.get('opf.authors', ''):
+            mi.authors = string_to_authors(data['opf.authors'])
+        if data.get('opf.authorsort', ''):
+            mi.author_sort = data['opf.authorsort']
+        if data.get('opf.isbn', ''):
+            isbn = check_isbn(data['opf.isbn'])
+            if isbn is not None:
+                mi.isbn = isbn
+        if data.get('opf.publisher', ''):
+            mi.publisher = data['opf.publisher']
+        if data.get('opf.pubdate', ''):
+            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
+        if data.get('opf.language', ''):
+            mi.languages = [ data['opf.language'] ]
+        opfnocover = data.get('opf.nocover', 'false') == 'true'
+    # search for a draw:image in a draw:frame with the name 'opf.cover'
+    # if opf.metadata prop is false, just use the first image that
+    # has a proper size (borrowed from docx)
+    otext = odLoad(stream)
+    cover_href = None
+    cover_data = None
+    # check that it's really an ODT
+    if otext.mimetype == u'application/vnd.oasis.opendocument.text':
+        for elem in otext.text.getElementsByType(odFrame):
+            img = elem.getElementsByType(odImage)
+            if len(img) > 0: # there should be only one
+                i_href = img[0].getAttribute('href')
+                try:
+                    raw = zin.read(i_href)
+                except KeyError:
+                    continue
+                try:
+                    width, height, fmt = identify_data(raw)
+                except:
+                    continue
+            else:
+                continue
+            if opfmeta and elem.getAttribute('name').lower() == u'opf.cover':
+                cover_href = i_href
+                cover_data = (fmt, raw)
+                break
+            if cover_href is None and 0.8 <= height/width <= 1.8 and height*width >= 12000:
+                cover_href = i_href
+                cover_data = (fmt, raw)
+                if not opfmeta:
+                    break
+
+    if not opfnocover and cover_href is not None:
+        mi.cover = cover_href
+        if extract_cover:
+            if not cover_data:
+                raw = zin.read(cover_href)
+                try:
+                    width, height, fmt = identify_data(raw)
+                except:
+                    pass
+                else:
+                    cover_data = (fmt, raw)
+            mi.cover_data = cover_data
 
     return mi