mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Added extra metadata parsing to ODT module.
Added manual section about ODT features added and general conversion tips.
This commit is contained in:
parent
c2ca92a97a
commit
f728c12835
@ -710,3 +710,31 @@ EPUB from the ZIP file are::
|
||||
|
||||
Note that because this file explores the potential of EPUB, most of the advanced formatting is not going to work on readers less capable than |app|'s built-in EPUB viewer.
|
||||
|
||||
|
||||
Convert ODT documents
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|app| can directly convert ODT (OpenDocument Text) files. You should use styles to format your document and minimize the use of direct formatting.
|
||||
When inserting images into your document you need to anchor them to the paragraph; images anchored to a page will all end up at the front of the converted document.
|
||||
|
||||
To enable automatic detection of chapters, you need to mark them with the built-in styles called 'Heading 1', 'Heading 2', ..., 'Heading 6' ('Heading 1' equates to the HTML tag <h1>, 'Heading 2' to <h2>, etc.). When you convert in |app| you can enter which style you used into the 'Detect chapters at' box. Examples:
|
||||
|
||||
* If you mark Chapters with style 'Heading 2', you have to set the 'Detect chapters at' box to ``//h:h2``
|
||||
* For a nested TOC with Sections marked with 'Heading 2' and the Chapters marked with 'Heading 3' you need to enter ``//h:h2|//h:h3``. On the Convert - TOC page set the 'Level 1 TOC' box to ``//h:h2`` and the 'Level 2 TOC' box to ``//h:h3``.
|
||||
|
||||
Well-known document properties (Title, Keywords, Description, Creator) are recognized, and |app| will use the first image (not too small, and with a good aspect ratio) as the cover image.
|
||||
|
||||
There is also an advanced property conversion mode, which is activated by setting the custom property ``opf.metadata`` ('Yes or No' type) to Yes in your ODT document (File->Properties->Custom Properties).
|
||||
If this property is detected by |app|, the following custom properties are recognized (``opf.authors`` overrides document creator)::
|
||||
|
||||
opf.titlesort
|
||||
opf.authors
|
||||
opf.authorsort
|
||||
opf.publisher
|
||||
opf.pubdate
|
||||
opf.isbn
|
||||
opf.language
|
||||
|
||||
In addition to this, you can specify the picture to use as the cover by naming it ``opf.cover`` (right click, Picture->Options->Name) in the ODT. If no picture with this name is found, the 'smart' method is used.
|
||||
To prevent |app| from using any image as the cover, set the custom property ``opf.nocover`` ('Yes or No' type) to Yes.
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
#
|
||||
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
@ -17,12 +19,19 @@
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
from __future__ import division
|
||||
|
||||
import zipfile, re
|
||||
import xml.sax.saxutils
|
||||
from cStringIO import StringIO
|
||||
|
||||
from odf.namespaces import OFFICENS, DCNS, METANS
|
||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors
|
||||
from odf.opendocument import load as odLoad
|
||||
from odf.draw import Image as odImage, Frame as odFrame
|
||||
|
||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors, check_isbn
|
||||
from calibre.utils.magick.draw import identify_data
|
||||
from calibre.utils.date import parse_date
|
||||
|
||||
whitespace = re.compile(r'\s+')
|
||||
|
||||
@ -125,6 +134,10 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
|
||||
else:
|
||||
texttag = self._tag
|
||||
self.seenfields[texttag] = self.data()
|
||||
# OpenOffice has a habit of capitalizing custom properties, so we add a
|
||||
# lowercase version for easy access
|
||||
if texttag[:4].lower() == u'opf.':
|
||||
self.seenfields[texttag.lower()] = self.data()
|
||||
|
||||
if field in self.deletefields:
|
||||
self.output.dowrite = True
|
||||
@ -141,7 +154,7 @@ class odfmetaparser(xml.sax.saxutils.XMLGenerator):
|
||||
def data(self):
|
||||
return normalize(''.join(self._data))
|
||||
|
||||
def get_metadata(stream):
|
||||
def get_metadata(stream, extract_cover=True):
|
||||
zin = zipfile.ZipFile(stream, 'r')
|
||||
odfs = odfmetaparser()
|
||||
parser = xml.sax.make_parser()
|
||||
@ -162,7 +175,73 @@ def get_metadata(stream):
|
||||
if data.has_key('language'):
|
||||
mi.language = data['language']
|
||||
if data.get('keywords', ''):
|
||||
mi.tags = data['keywords'].split(',')
|
||||
mi.tags = map(lambda x: x.strip(), data['keywords'].split(','))
|
||||
opfmeta = False # we need this later for the cover
|
||||
opfnocover = False
|
||||
if data.get('opf.metadata','') == 'true':
|
||||
# custom metadata contains OPF information
|
||||
opfmeta = True
|
||||
if data.get('opf.titlesort', ''):
|
||||
mi.title_sort = data['opf.titlesort']
|
||||
if data.get('opf.authors', ''):
|
||||
mi.authors = string_to_authors(data['opf.authors'])
|
||||
if data.get('opf.authorsort', ''):
|
||||
mi.author_sort = data['opf.authorsort']
|
||||
if data.get('opf.isbn', ''):
|
||||
isbn = check_isbn(data['opf.isbn'])
|
||||
if isbn is not None:
|
||||
mi.isbn = isbn
|
||||
if data.get('opf.publisher', ''):
|
||||
mi.publisher = data['opf.publisher']
|
||||
if data.get('opf.pubdate', ''):
|
||||
mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
|
||||
if data.get('opf.language', ''):
|
||||
mi.languages = [ data['opf.language'] ]
|
||||
opfnocover = data.get('opf.nocover', 'false') == 'true'
|
||||
# search for a draw:image in a draw:frame with the name 'opf.cover'
|
||||
# if opf.metadata prop is false, just use the first image that
|
||||
# has a proper size (borrowed from docx)
|
||||
otext = odLoad(stream)
|
||||
cover_href = None
|
||||
cover_data = None
|
||||
# check that it's really an ODT
|
||||
if otext.mimetype == u'application/vnd.oasis.opendocument.text':
|
||||
for elem in otext.text.getElementsByType(odFrame):
|
||||
img = elem.getElementsByType(odImage)
|
||||
if len(img) > 0: # there should be only one
|
||||
i_href = img[0].getAttribute('href')
|
||||
try:
|
||||
raw = zin.read(i_href)
|
||||
except KeyError:
|
||||
continue
|
||||
try:
|
||||
width, height, fmt = identify_data(raw)
|
||||
except:
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
if opfmeta and elem.getAttribute('name').lower() == u'opf.cover':
|
||||
cover_href = i_href
|
||||
cover_data = (fmt, raw)
|
||||
break
|
||||
if cover_href is None and 0.8 <= height/width <= 1.8 and height*width >= 12000:
|
||||
cover_href = i_href
|
||||
cover_data = (fmt, raw)
|
||||
if not opfmeta:
|
||||
break
|
||||
|
||||
if not opfnocover and cover_href is not None:
|
||||
mi.cover = cover_href
|
||||
if extract_cover:
|
||||
if not cover_data:
|
||||
raw = zin.read(cover_href)
|
||||
try:
|
||||
width, height, fmt = identify_data(raw)
|
||||
except:
|
||||
pass
|
||||
else:
|
||||
cover_data = (fmt, raw)
|
||||
mi.cover_data = cover_data
|
||||
|
||||
return mi
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user