mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Improved FB2 metadata reading, by Roman Mukhin
This commit is contained in:
parent
62cfdc8023
commit
2a9e73ff12
@ -1,96 +1,235 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from __future__ import with_statement
|
from __future__ import with_statement
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
|
||||||
|
'2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
||||||
'''Read meta information from fb2 files'''
|
'''Read meta information from fb2 files'''
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import datetime
|
||||||
|
from functools import partial
|
||||||
from base64 import b64decode
|
from base64 import b64decode
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.utils.date import parse_date
|
||||||
|
from calibre import guess_all_extensions, prints, force_unicode
|
||||||
|
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre import guess_all_extensions
|
|
||||||
|
|
||||||
XLINK_NS = 'http://www.w3.org/1999/xlink'
|
|
||||||
def XLINK(name):
    """Return `name` qualified with the XLink namespace as '{namespace}name'."""
    qualified = '{%s}%s' % (XLINK_NS, name)
    return qualified
|
|
||||||
|
|
||||||
|
# Prefix -> namespace URI map shared by every XPath expression in this module.
NAMESPACES = dict(
    fb2='http://www.gribuser.ru/xml/fictionbook/2.0',
    xlink='http://www.w3.org/1999/xlink',
)

# Compile an XPath expression with the fb2/xlink prefixes already bound.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
# Serialize an element as plain unicode text (markup is dropped).
tostring = partial(etree.tostring, method='text', encoding=unicode)
|
||||||
|
|
||||||
def get_metadata(stream):
|
def get_metadata(stream):
    """ Return fb2 metadata as a L{MetaInformation} object

    The title and the authors are read first and used to construct the
    result; every other field (cover, comments, tags, series, isbn,
    publisher, pubdate, timestamp, language) is read on a best-effort
    basis, so a failure in one of them never prevents the others from
    being filled in.

    :param stream: file-like object providing ``read()``; its optional
                   ``name`` attribute is used to derive a fallback title.
    :return: the populated MetaInformation object.
    """
    root = _get_fbroot(stream)

    book_title = _parse_book_title(root)
    authors = _parse_authors(root)

    # fallback for book_title: the file name without its extension
    if book_title:
        book_title = unicode(book_title)
    else:
        book_title = force_unicode(os.path.splitext(
            os.path.basename(getattr(stream, 'name',
                _('Unknown'))))[0])
    mi = MetaInformation(book_title, authors)

    # Optional fields, in the order they were originally processed.
    # _parse_uuid is left disabled, as it was in the original code.
    optional_parsers = (
        _parse_cover,
        _parse_comments,
        _parse_tags,
        _parse_series,
        _parse_isbn,
        _parse_publisher,
        _parse_pubdate,
        _parse_timestamp,
        _parse_language,
    )
    for parse in optional_parsers:
        try:
            parse(root, mi)
        except Exception:
            # Deliberately best-effort: a malformed optional field is
            # skipped.  Catching Exception (rather than a bare except)
            # lets KeyboardInterrupt and SystemExit propagate.
            pass

    return mi
|
||||||
|
|
||||||
|
def _parse_authors(root):
    """Return the list of author names for the book.

    Authors are taken from exactly one section so that the result stays
    consistent: <title-info> is preferred, <src-title-info> and
    <document-info> are fallbacks that are consulted only while no author
    has been found.  When no section yields an author the list contains
    the translated string 'Unknown'.

    :param root: parsed FB2 document root.
    :return: non-empty list of author name strings.
    """
    authors = []
    for author_sec in ('title-info', 'src-title-info', 'document-info'):
        for au in XPath('//fb2:%s/fb2:author' % author_sec)(root):
            author = _parse_author(au)
            if author:
                authors.append(author)
        # Test the accumulated list, not the loop variable: `author` is
        # unbound when the section has no <author> element, and it only
        # reflects the *last* author of the section otherwise.
        if authors:
            break

    # if no author so far
    if not authors:
        authors.append(_('Unknown'))

    return authors
|
||||||
|
|
||||||
|
def _parse_author(elm_author):
    """Return the display name of one <author> element as a single string.

    The name is assembled as "first middle last" from whichever of those
    parts are present; when none of them is, the <nickname> is used.  The
    result is empty when the element carries no usable name at all.
    """

    def field(tag):
        # Whitespace-normalized text of the given child element.
        return XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)

    name = field('first-name')
    surname = field('last-name')
    middle = field('middle-name')

    # Append the middle name first, then the last name, skipping blanks.
    for part in (middle, surname):
        if part:
            name = (name + ' ' + part).strip()

    if name:
        return name

    # fallback to nickname (the empty name is returned if that is blank too)
    return field('nickname') or name
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_book_title(root):
    """Return the whitespace-normalized book title, or an empty string.

    <title-info> has priority (it is actually mandatory); <publish-info>
    and <src-title-info> are backups, tried in that order.  The order is
    intentional and differs from the order of the sections in an FB2
    document.

    :param root: parsed FB2 document root.
    :return: the first non-empty title found, otherwise an empty string.
    """
    book_title = ''
    # Each section is queried on its own: normalize-space() applied to a
    # union of node-sets uses the first node in *document* order, which
    # would silently ignore the priority order required here.
    for section in ('title-info', 'publish-info', 'src-title-info'):
        book_title = XPath(
            'normalize-space(//fb2:%s/fb2:book-title/text())' % section)(root)
        if book_title:
            break

    return book_title
|
||||||
|
|
||||||
|
def _parse_cover(root, mi):
    """Locate the cover image reference and store its data on `mi`.

    The first <coverpage>/<image> xlink:href in the document is used
    (normally the one in <title-info>, with <src-title-info> acting as a
    fallback when <title-info> has none).  Only internal references of the
    form "#id" are followed; `mi` is left untouched when there is no such
    reference or when the referenced data cannot be read.

    :param root: parsed FB2 document root.
    :param mi: metadata object that receives `cover_data`.
    """
    # substring-after() yields an empty string when the href has no '#'.
    imgid = XPath('substring-after(string(//fb2:coverpage/fb2:image/@xlink:href), "#")')(root)
    if not imgid:
        return
    try:
        _parse_cover_data(root, imgid, mi)
    except Exception:
        # A broken cover must not abort metadata reading.  Exception (not
        # a bare except) keeps KeyboardInterrupt/SystemExit propagating.
        pass
|
||||||
|
|
||||||
|
def _parse_cover_data(root, imgid, mi):
    """Decode the <binary> element with id `imgid` into `mi.cover_data`.

    `mi.cover_data` is set to an (extension, decoded bytes) pair.  Nothing
    is stored when the element is missing or empty; a warning is printed
    when no file extension is known for its content type.
    """
    matches = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
    if not matches:
        return

    binary = matches[0]
    # FB2 files that omit the content type are treated as JPEG.
    mimetype = binary.get('content-type', 'image/jpeg')
    extensions = guess_all_extensions(mimetype)
    if not extensions:
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )
        return

    encoded = binary.text
    if encoded:
        # Drop the leading dot of the extension, e.g. '.jpg' -> 'jpg'.
        mi.cover_data = (extensions[0][1:], b64decode(encoded))
|
||||||
|
|
||||||
|
def _parse_tags(root, mi):
    """Store the book's genres on `mi.tags`.

    Genres are taken from one section only, to keep them consistent:
    <title-info> first, <src-title-info> as the fallback.  `mi.tags` is
    not touched when neither section lists a genre.
    """
    for section in ('title-info', 'src-title-info'):
        # -- i18n Translations-- ?
        genres = XPath('//fb2:%s/fb2:genre/text()' % section)(root)
        if not genres:
            continue
        mi.tags = [unicode(genre) for genre in genres]
        break
|
||||||
|
|
||||||
|
def _parse_series(root, mi):
    """Store the first <sequence> of the book as `mi.series`.

    calibre supports only one series, so the first sequence is used.  It
    is looked up in <title-info> and <publish-info> (but not in
    <src-title-info>).  `mi.series_index` is set only when the sequence
    has a name and a numeric `number` attribute; otherwise it keeps
    whatever value `mi` already had.

    :param root: parsed FB2 document root.
    :param mi: metadata object that receives `series` / `series_index`.
    """
    xp_ti = '//fb2:title-info/fb2:sequence[1]'
    xp_pi = '//fb2:publish-info/fb2:sequence[1]'

    elms_sequence = XPath('%s|%s' % (xp_ti, xp_pi))(root)
    if not elms_sequence:
        return

    sequence = elms_sequence[0]
    mi.series = sequence.get('name', None)
    if mi.series:
        # The attribute is text (or missing); store a number rather than
        # the raw string or None so the index stays usable as a number.
        try:
            mi.series_index = float(sequence.get('number', None))
        except (TypeError, ValueError):
            pass
|
||||||
|
|
||||||
|
def _parse_isbn(root, mi):
    """Store the ISBN from <publish-info> on `mi.isbn` if it passes check_isbn.

    Some people put several comma-separated ISBNs in this field although
    that is not allowed; in that case only the first one is considered.
    """
    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)

    comma = isbn.find(',')
    if comma != -1:
        # keep only the text before the first comma
        isbn = isbn[:comma]

    if check_isbn(isbn):
        mi.isbn = isbn
|
||||||
|
|
||||||
|
def _parse_comments(root, mi):
    """Store the text of the book's <annotation> on `mi.comments`.

    The annotation is taken from one section only: <title-info> first,
    <src-title-info> as the fallback.  `mi.comments` is not touched when
    neither section has an annotation.
    """
    for section in ('title-info', 'src-title-info'):
        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
        if not annotations:
            continue
        # TODO: tags i18n, xslt?
        mi.comments = tostring(annotations[0])
        break
|
||||||
|
|
||||||
|
def _parse_publisher(root, mi):
    """Store the <publish-info>/<publisher> text on `mi.publisher`, if any."""
    name = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
    if not name:
        return
    mi.publisher = name
|
||||||
|
|
||||||
|
def _parse_pubdate(root, mi):
    """Store the publication year from <publish-info> on `mi.pubdate`.

    The year is evaluated as an XPath number; anything that is not a
    whole number (including a missing year) is ignored.
    """
    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
    if not float.is_integer(year):
        return
    # only year is available, so use 1-st of Jan
    mi.pubdate = datetime.date(int(year), 1, 1)
|
||||||
|
|
||||||
|
def _parse_timestamp(root, mi):
    """Store the <document-info>/<date> on `mi.timestamp`, parsed by parse_date.

    Both the `value` attribute and the element text are candidates, e.g.
    <date value="1996-12-03">03.12.1996</date>
    """
    date_elm = '//fb2:document-info/fb2:date'
    # string() of the union yields the first matching node's text.
    docdate = XPath('string(%s/@value|%s/text())' % (date_elm, date_elm))(root)
    if not docdate:
        return
    mi.timestamp = parse_date(docdate)
|
||||||
|
|
||||||
|
def _parse_language(root, mi):
    """Store the <title-info>/<lang> text on `mi.language` and `mi.languages`."""
    lang = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
    if not lang:
        return
    mi.language = lang
    # the plural attribute holds the same single language as a list
    mi.languages = [ lang ]
|
||||||
|
|
||||||
|
def _parse_uuid(root, mi):
    """Store the document id from <document-info>/<id> on `mi.uuid`, if any.

    :param root: parsed FB2 document root.
    :param mi: metadata object that receives `uuid`.
    """
    # Every step needs the fb2: prefix; an unprefixed `document-info`
    # only matches elements in no namespace and so never matched here.
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid
|
||||||
|
|
||||||
|
def _get_fbroot(stream):
    """Read `stream` completely and return the root element of the parsed XML.

    The parser is created with recover=True and no_network=True.
    """
    recovering_parser = etree.XMLParser(recover=True, no_network=True)
    # xml_to_unicode returns a sequence; its first item is what gets parsed
    # (presumably the decoded text -- confirm against calibre.ebooks.chardet).
    text = xml_to_unicode(stream.read(), strip_encoding_pats=True)[0]
    return etree.fromstring(text, parser=recovering_parser)
|
||||||
for au in XPath('//fb2:author')(root):
|
|
||||||
fname = lname = author = None
|
|
||||||
fe = XPath('descendant::fb2:first-name')(au)
|
|
||||||
if fe:
|
|
||||||
fname = tostring(fe[0])
|
|
||||||
author = fname
|
|
||||||
le = XPath('descendant::fb2:last-name')(au)
|
|
||||||
if le:
|
|
||||||
lname = tostring(le[0])
|
|
||||||
if author:
|
|
||||||
author += ' '+lname
|
|
||||||
else:
|
|
||||||
author = lname
|
|
||||||
if author:
|
|
||||||
authors.append(author)
|
|
||||||
if len(authors) == 1 and author is not None:
|
|
||||||
if lname:
|
|
||||||
author_sort = lname
|
|
||||||
if fname:
|
|
||||||
if author_sort: author_sort += ', '+fname
|
|
||||||
else: author_sort = fname
|
|
||||||
title = os.path.splitext(os.path.basename(getattr(stream, 'name',
|
|
||||||
_('Unknown'))))[0]
|
|
||||||
for x in XPath('//fb2:book-title')(root):
|
|
||||||
title = tostring(x)
|
|
||||||
break
|
|
||||||
comments = ''
|
|
||||||
for x in XPath('//fb2:annotation')(root):
|
|
||||||
comments += tostring(x)
|
|
||||||
if not comments:
|
|
||||||
comments = None
|
|
||||||
tags = list(map(tostring, XPath('//fb2:genre')(root)))
|
|
||||||
|
|
||||||
cp = XPath('//fb2:coverpage')(root)
|
|
||||||
cdata = None
|
|
||||||
if cp:
|
|
||||||
cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
|
|
||||||
if cimage:
|
|
||||||
id = cimage[0].get(XLINK('href')).replace('#', '')
|
|
||||||
binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
|
|
||||||
if binary:
|
|
||||||
mt = binary[0].get('content-type', 'image/jpeg')
|
|
||||||
exts = guess_all_extensions(mt)
|
|
||||||
if not exts:
|
|
||||||
exts = ['.jpg']
|
|
||||||
cdata = (exts[0][1:], b64decode(tostring(binary[0])))
|
|
||||||
|
|
||||||
series = None
|
|
||||||
series_index = 1.0
|
|
||||||
for x in XPath('//fb2:sequence')(root):
|
|
||||||
series = x.get('name', None)
|
|
||||||
if series is not None:
|
|
||||||
series_index = x.get('number', 1.0)
|
|
||||||
break
|
|
||||||
mi = MetaInformation(title, authors)
|
|
||||||
mi.comments = comments
|
|
||||||
mi.author_sort = author_sort
|
|
||||||
if tags:
|
|
||||||
mi.tags = tags
|
|
||||||
mi.series = series
|
|
||||||
mi.series_index = series_index
|
|
||||||
if cdata:
|
|
||||||
mi.cover_data = cdata
|
|
||||||
return mi
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user