Improved FB2 metadata reading, by Roman Mukhin

This commit is contained in:
Kovid Goyal 2011-06-23 17:06:37 -06:00
parent 62cfdc8023
commit 2a9e73ff12

View File

@ -1,96 +1,235 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>' __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
'2008, Anatoly Shipitsin <norguhtar at gmail.com>'
'''Read meta information from fb2 files''' '''Read meta information from fb2 files'''
import os import os
import datetime
from functools import partial
from base64 import b64decode from base64 import b64decode
from lxml import etree from lxml import etree
from calibre.ebooks.metadata import MetaInformation from calibre.utils.date import parse_date
from calibre import guess_all_extensions, prints, force_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre import guess_all_extensions
XLINK_NS = 'http://www.w3.org/1999/xlink'
def XLINK(name):
    """ Return name qualified with the xlink namespace, as '{namespace}name' """
    return '{%(ns)s}%(name)s' % {'ns': XLINK_NS, 'name': name}
# Prefix -> namespace URI map handed to every XPath expression built below.
NAMESPACES = {
'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0',
'xlink' : 'http://www.w3.org/1999/xlink' }
# etree.XPath with NAMESPACES pre-bound, so callers pass only the expression.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
# etree.tostring pre-bound to method='text' and a unicode result.
tostring = partial(etree.tostring, method='text', encoding=unicode)
def get_metadata(stream):
    """ Return fb2 metadata as a L{MetaInformation} object

    stream is a file-like object holding the fb2 document; it is read in full.
    Title and authors are always set. Every other field (cover, comments,
    tags, series, isbn, publisher, pubdate, timestamp, language) is parsed on
    a best-effort basis: a failure in one of them leaves that field unset and
    does not affect the others.
    """
    root = _get_fbroot(stream)

    book_title = _parse_book_title(root)
    authors = _parse_authors(root)

    # fallback for book_title: the stream's file name without its extension,
    # or the translated 'Unknown' when the stream has no name attribute
    if book_title:
        book_title = unicode(book_title)
    else:
        book_title = force_unicode(os.path.splitext(
            os.path.basename(getattr(stream, 'name',
                _('Unknown'))))[0])
    mi = MetaInformation(book_title, authors)

    # Optional fields, in the order they were originally parsed.
    # _parse_uuid(root, mi) is deliberately left out, as it was disabled
    # in the original code.
    optional_parsers = (
        _parse_cover,
        _parse_comments,
        _parse_tags,
        _parse_series,
        _parse_isbn,
        _parse_publisher,
        _parse_pubdate,
        _parse_timestamp,
        _parse_language,
    )
    for parse in optional_parsers:
        try:
            parse(root, mi)
        except Exception:
            # Best effort: a malformed optional field must not prevent the
            # rest of the metadata from being returned. Only Exception is
            # caught so that KeyboardInterrupt/SystemExit still propagate.
            pass
    return mi
def _parse_authors(root):
    """ Return the list of author names found in the document

    Authors are taken from a single section only. Sections are tried in the
    order <title-info>, <src-title-info>, <document-info> and the first one
    that yields at least one non-empty author name wins. When no section
    yields an author the list contains the translated 'Unknown'.
    """
    authors = []
    # pick up authors but only from 1 section <title-info>; otherwise it is
    # not consistent!
    # Those are fallbacks: <src-title-info>, <document-info>
    for author_sec in ('title-info', 'src-title-info', 'document-info'):
        for au in XPath('//fb2:%s/fb2:author' % author_sec)(root):
            author = _parse_author(au)
            if author:
                authors.append(author)
        # Test the collected list, not the loop variable: 'author' is unbound
        # when the section has no <author> element, and it only reflects the
        # last author of the section otherwise.
        if authors:
            break

    # if no author so far
    if not authors:
        authors.append(_('Unknown'))

    return authors
def _parse_author(elm_author):
    """ Return the display name of one <author> element as a single string

    The name is built as "first-name middle-name last-name", skipping the
    parts that are empty. If all three are empty the <nickname> is used
    instead; the result is empty when that is missing as well.
    """
    def field(tag):
        # whitespace-normalised text of the given child element ('' if absent)
        return XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)

    author = field('first-name')
    lname = field('last-name')
    mname = field('middle-name')

    # the middle name goes before the last name
    for part in (mname, lname):
        if part:
            author = (author + ' ' + part).strip()

    # fallback to nickname
    if not author:
        nname = field('nickname')
        if nname:
            author = nname

    return author
series_index = 1.0
for x in XPath('//fb2:sequence')(root):
def _parse_book_title(root):
    """ Return the whitespace-normalised book title, or an empty string

    <title-info> has priority (it is actually mandatory); <publish-info> and
    then <src-title-info> are fallbacks. This order is intentional and differs
    from the order of the sections inside an fb2 document.
    """
    # The sections are queried one at a time. A single 'a|b|c' union would be
    # evaluated in document order, and normalize-space() only looks at the
    # first node of a node-set, so it could not express this priority.
    book_title = ''
    for section in ('title-info', 'publish-info', 'src-title-info'):
        book_title = XPath(
            'normalize-space(//fb2:%s/fb2:book-title/text())' % section)(root)
        if book_title:
            break

    return book_title
def _parse_cover(root, mi):
    """ Find the cover image reference and store the image on mi

    Does nothing when the document has no <coverpage> image, or when its
    xlink:href contains no '#' (only the part after '#' is used as the id of
    the <binary> element holding the picture).
    """
    # The expression takes the first <coverpage> image in document order;
    # per the original note that is <title-info>'s, falling back to
    # <src-title-info>'s when the former has none.
    imgid = XPath('substring-after(string(//fb2:coverpage/fb2:image/@xlink:href), "#")')(root)
    if not imgid:
        return
    try:
        _parse_cover_data(root, imgid, mi)
    except Exception:
        # The cover is optional: undecodable image data must not abort
        # metadata reading. Only Exception is caught so that
        # KeyboardInterrupt/SystemExit still propagate.
        pass
def _parse_cover_data(root, imgid, mi):
    """ Decode the <binary> element with id imgid into mi.cover_data

    mi.cover_data is set to an (extension, raw bytes) tuple, where the
    extension (without the dot) is derived from the element's content-type
    ('image/jpeg' when the attribute is missing). Nothing is set when no
    such <binary> exists or it has no text. A content-type that maps to no
    known extension is reported with a warning and the cover is skipped.
    """
    # The id is passed as an XPath variable rather than interpolated into the
    # expression, so ids containing quotes cannot break the expression.
    elm_binary = XPath('//fb2:binary[@id=$imgid]')(root, imgid=imgid)
    if not elm_binary:
        return

    binary = elm_binary[0]
    mimetype = binary.get('content-type', 'image/jpeg')
    mime_extensions = guess_all_extensions(mimetype)
    if not mime_extensions:
        prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )
        return

    pic_data = binary.text
    if pic_data:
        # [1:] drops the leading dot of the guessed extension
        mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
def _parse_tags(root, mi):
    """ Set mi.tags from the <genre> entries of the document

    Genres are taken from one section only: <title-info>, or <src-title-info>
    as a fallback; mixing both would not be consistent. mi.tags is left
    untouched when neither section has a genre.
    """
    for section in ['title-info', 'src-title-info']:
        # -- i18n Translations-- ?
        genres = XPath('//fb2:%s/fb2:genre/text()' % section)(root)
        if not genres:
            continue
        mi.tags = [unicode(genre) for genre in genres]
        break
mi.series_index = series_index
def _parse_series(root, mi):
    """ Set mi.series and mi.series_index from the first <sequence> element

    calibre supports only one series, so only the first <sequence> is used.
    It is looked up in <title-info> first and then in <publish-info>;
    <src-title-info> is deliberately not consulted. mi.series_index is a
    float, or None when the 'number' attribute is missing or not numeric.
    """
    # The sections are queried one at a time: an 'a|b' union would come back
    # in document order and so could not express the preferred order.
    elms_sequence = []
    for section in ('title-info', 'publish-info'):
        elms_sequence = XPath('//fb2:%s/fb2:sequence[1]' % section)(root)
        if elms_sequence:
            break
    if not elms_sequence:
        return

    sequence = elms_sequence[0]
    mi.series = sequence.get('name', None)
    if mi.series:
        try:
            # the attribute is text; store a number, not the raw string
            mi.series_index = float(sequence.get('number', None))
        except (TypeError, ValueError):
            # missing (None) or non-numeric 'number' attribute
            mi.series_index = None
def _parse_isbn(root, mi):
    """ Set mi.isbn from <publish-info>/<isbn> when check_isbn accepts it """
    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)

    # some people try to put several isbn in this field, but it is not
    # allowed. try to stick to the 1-st one in this case
    first, comma, _rest = isbn.partition(',')
    if comma:
        isbn = first

    if check_isbn(isbn):
        mi.isbn = isbn
def _parse_comments(root, mi):
    """ Set mi.comments to the text of the first <annotation> found

    The annotation is taken from one section only: <title-info>, or
    <src-title-info> as a fallback.
    """
    for section in ['title-info', 'src-title-info']:
        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
        if not annotations:
            continue
        mi.comments = tostring(annotations[0])
        # TODO: tags i18n, xslt?
        break
def _parse_publisher(root, mi):
    """ Set mi.publisher from <publish-info>/<publisher>, if it has text """
    name = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
    if not name:
        return
    mi.publisher = name
def _parse_pubdate(root, mi):
    """ Set mi.pubdate from <publish-info>/<year> when it is a whole number """
    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
    if not float.is_integer(year):
        # missing or non-numeric year (the XPath result is not a whole number)
        return
    # only year is available, so use 1-st of Jan
    mi.pubdate = datetime.date(int(year), 1, 1)
def _parse_timestamp(root, mi):
    """ Set mi.timestamp from the <date> of <document-info>, if present

    Both the 'value' attribute and the element text are candidates, e.g.
    <date value="1996-12-03">03.12.1996</date>; whichever the XPath
    expression yields is handed to parse_date.
    """
    candidates = ('//fb2:document-info/fb2:date/@value',
                  '//fb2:document-info/fb2:date/text()')
    docdate = XPath('string(%s)' % '|'.join(candidates))(root)
    if not docdate:
        return
    mi.timestamp = parse_date(docdate)
def _parse_language(root, mi):
    """ Set mi.language and mi.languages from <title-info>/<lang>, if set """
    lang = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
    if not lang:
        return
    mi.language = lang
    mi.languages = [lang]
def _parse_uuid(root, mi):
    """ Set mi.uuid from <document-info>/<id>, if it has text """
    # <document-info> needs the fb2: prefix like every other element here;
    # without it the step matches only elements in no namespace.
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid
def _get_fbroot(stream):
    """ Read stream in full and return the root element of the parsed fb2

    The data goes through xml_to_unicode (with strip_encoding_pats=True)
    first and is then parsed by an lxml parser created with recover=True and
    no_network=True.
    """
    text = xml_to_unicode(stream.read(), strip_encoding_pats=True)[0]
    lenient_parser = etree.XMLParser(recover=True, no_network=True)
    return etree.fromstring(text, parser=lenient_parser)