Added extra metadata parsing to ODT module.

Added manual section about ODT features added and general conversion tips.
This commit is contained in:
Oliver Graf 2012-07-27 15:01:47 +02:00
parent c2ca92a97a
commit f728c12835
2 changed files with 110 additions and 3 deletions

View File

@ -710,3 +710,31 @@ EPUB from the ZIP file are::
Note that because this file explores the potential of EPUB, most of the advanced formatting is not going to work on readers less capable than |app|'s built-in EPUB viewer.
Convert ODT documents
~~~~~~~~~~~~~~~~~~~~~
|app| can directly convert ODT (OpenDocument Text) files. You should use styles to format your document and minimize the use of direct formatting.
When inserting images into your document, you need to anchor them to the paragraph; images anchored to a page will all end up at the front of the conversion.
To enable automatic detection of chapters, you need to mark them with the built-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6' ('Heading 1' equates to the HTML tag <h1>, 'Heading 2' to <h2>, etc.). When you convert in |app| you can enter which style you used into the 'Detect chapters at' box. Example:
* If you mark Chapters with style 'Heading 2', you have to set the 'Detect chapters at' box to ``//h:h2``
* For a nested TOC with Sections marked with 'Heading 2' and the Chapters marked with 'Heading 3' you need to enter ``//h:h2|//h:h3``. On the Convert - TOC page set the 'Level 1 TOC' box to ``//h:h2`` and the 'Level 2 TOC' box to ``//h:h3``.
Well-known document properties (Title, Keywords, Description, Creator) are recognized and |app| will use the first image (not too small, and with a good aspect ratio) as the cover image.
There is also an advanced property conversion mode, which is activated by setting the custom property ``opf.metadata`` ('Yes or No' type) to Yes in your ODT document (File->Properties->Custom Properties).
If this property is detected by |app|, the following custom properties are recognized (``opf.authors`` overrides document creator)::
opf.titlesort
opf.authors
opf.authorsort
opf.publisher
opf.pubdate
opf.isbn
opf.language
In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method (the first image that is not too small and has a good aspect ratio) is used.
To prevent this you can set the custom property ``opf.nocover`` ('Yes or No' type) to Yes.

View File

@ -1,5 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
#
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
@ -17,12 +19,19 @@
#
# Contributor(s):
#
from __future__ import division
import zipfile, re
import xml.sax.saxutils
from cStringIO import StringIO
from odf.namespaces import OFFICENS, DCNS, METANS
from calibre.ebooks.metadata import MetaInformation, string_to_authors
from odf.opendocument import load as odLoad
from odf.draw import Image as odImage, Frame as odFrame
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
from calibre.utils.magick.draw import identify_data
from calibre.utils.date import parse_date
whitespace = re.compile(r'\s+')
@ -125,6 +134,10 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
else:
texttag = self._tag
self.seenfields[texttag] = self.data()
# OpenOffice has a habit of capitalizing custom properties, so we add a
# lowercase version for easy access
if texttag[:4].lower() == u'opf.':
self.seenfields[texttag.lower()] = self.data()
if field in self.deletefields:
self.output.dowrite = True
@ -141,7 +154,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
def data(self):
    """Join the pieces held in self._data and return them normalized."""
    joined = ''.join(self._data)
    return normalize(joined)
def get_metadata(stream):
def get_metadata(stream, extract_cover=True):
zin = zipfile.ZipFile(stream, 'r')
odfs = odfmetaparser()
parser = xml.sax.make_parser()
@ -162,7 +175,73 @@ def get_metadata(stream):
if data.has_key('language'):
mi.language = data['language']
if data.get('keywords', ''):
mi.tags = data['keywords'].split(',')
mi.tags = map(lambda x: x.strip(), data['keywords'].split(','))
opfmeta = False # we need this later for the cover
opfnocover = False
if data.get('opf.metadata','') == 'true':
# custom metadata contains OPF information
opfmeta = True
if data.get('opf.titlesort', ''):
mi.title_sort = data['opf.titlesort']
if data.get('opf.authors', ''):
mi.authors = string_to_authors(data['opf.authors'])
if data.get('opf.authorsort', ''):
mi.author_sort = data['opf.authorsort']
if data.get('opf.isbn', ''):
isbn = check_isbn(data['opf.isbn'])
if isbn is not None:
mi.isbn = isbn
if data.get('opf.publisher', ''):
mi.publisher = data['opf.publisher']
if data.get('opf.pubdate', ''):
mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
if data.get('opf.language', ''):
mi.languages = [ data['opf.language'] ]
opfnocover = data.get('opf.nocover', 'false') == 'true'
# search for a draw:image in a draw:frame with the name 'opf.cover'
# if opf.metadata prop is false, just use the first image that
# has a proper size (borrowed from docx)
otext = odLoad(stream)
cover_href = None
cover_data = None
# check that it's really an ODT
if otext.mimetype == u'application/vnd.oasis.opendocument.text':
for elem in otext.text.getElementsByType(odFrame):
img = elem.getElementsByType(odImage)
if len(img) > 0: # there should be only one
i_href = img[0].getAttribute('href')
try:
raw = zin.read(i_href)
except KeyError:
continue
try:
width, height, fmt = identify_data(raw)
except:
continue
else:
continue
if opfmeta and elem.getAttribute('name').lower() == u'opf.cover':
cover_href = i_href
cover_data = (fmt, raw)
break
if cover_href is None and 0.8 <= height/width <= 1.8 and height*width >= 12000:
cover_href = i_href
cover_data = (fmt, raw)
if not opfmeta:
break
if not opfnocover and cover_href is not None:
mi.cover = cover_href
if extract_cover:
if not cover_data:
raw = zin.read(cover_href)
try:
width, height, fmt = identify_data(raw)
except:
pass
else:
cover_data = (fmt, raw)
mi.cover_data = cover_data
return mi