Improved FB2 metadata reading, by Roman Mukhin

2025-12-11 07:35:14 -05:00 · 2011-06-23 17:06:37 -06:00 · 2011-06-23 17:06:37 -06:00 · 2a9e73ff12
commit 2a9e73ff12
parent 62cfdc8023
1 changed files with 218 additions and 79 deletions
--- a/src/calibre/ebooks/metadata/fb2.py
+++ b/src/calibre/ebooks/metadata/fb2.py
@ -1,96 +1,235 @@
 #!/usr/bin/env python
 from __future__ import with_statement
 __license__   = 'GPL v3'
-__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
-
+__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
+                '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
 '''Read meta information from fb2 files'''

 import os
+import datetime
+from functools import partial
 from base64 import b64decode
 from lxml import etree
-from calibre.ebooks.metadata import MetaInformation
+from calibre.utils.date import parse_date
+from calibre import guess_all_extensions, prints, force_unicode
+from calibre.ebooks.metadata import MetaInformation, check_isbn
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre import guess_all_extensions

-XLINK_NS     = 'http://www.w3.org/1999/xlink'
-def XLINK(name):
-    return '{%s}%s' % (XLINK_NS, name)

+# Namespace prefixes usable in every XPath expression built in this module.
+NAMESPACES = {
+    'fb2'   :   'http://www.gribuser.ru/xml/fictionbook/2.0',
+    'xlink' :   'http://www.w3.org/1999/xlink'  }
+
+# etree.XPath with the namespace map above already bound.
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
+# etree.tostring bound to text output with a unicode result.
+tostring = partial(etree.tostring, method='text', encoding=unicode)

 def get_metadata(stream):
-    """ Return metadata as a L{MetaInfo} object """
-    XPath = lambda x : etree.XPath(x,
-            namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0',
-                'xlink':XLINK_NS})
-    tostring = lambda x : etree.tostring(x, method='text',
-            encoding=unicode).strip()
+    """Return fb2 metadata as a L{MetaInformation} object.
+
+    The title and authors are always set: the title falls back to the
+    stream's file name without extension (or the translated 'Unknown'),
+    and _parse_authors supplies its own fallback.  Every other field is
+    optional and is filled in on a best-effort basis.
+
+    @param stream: file-like object holding the FB2 document; its
+        optional ``name`` attribute is used for the title fallback.
+    @return: the populated MetaInformation object.
+    """
+
+    root = _get_fbroot(stream)
+
+    book_title = _parse_book_title(root)
+    authors = _parse_authors(root)
+
+    # fallback for book_title
+    if book_title:
+        book_title = unicode(book_title)
+    else:
+        book_title = force_unicode(os.path.splitext(
+            os.path.basename(getattr(stream, 'name',
+                _('Unknown'))))[0])
+    mi = MetaInformation(book_title, authors)
+
+    # Optional fields, parsed in this order.  A failure in one parser must
+    # not prevent the remaining fields from being read.
+    optional_parsers = (
+        _parse_cover,
+        _parse_comments,
+        _parse_tags,
+        _parse_series,
+        _parse_isbn,
+        _parse_publisher,
+        _parse_pubdate,
+        _parse_timestamp,
+        _parse_language,
+    )
+    for parse in optional_parsers:
+        try:
+            parse(root, mi)
+        except Exception:
+            # Deliberately best effort: skip the broken field.  Catching
+            # Exception (not a bare except) lets KeyboardInterrupt and
+            # SystemExit propagate.
+            pass
+    #_parse_uuid(root, mi)
+
+    #if DEBUG:
+    #   prints(mi)
+    return mi
+
+def _parse_authors(root):
+    """Return the list of author names found in the document.
+
+    Authors are taken from exactly one description section so the result
+    stays consistent: <title-info> first, then <src-title-info>, then
+    <document-info>.  The first section that yields at least one
+    non-empty author wins.  When no section yields an author the list
+    contains the translated 'Unknown'.
+    """
+    authors = []
+    for author_sec in ['title-info', 'src-title-info', 'document-info']:
+        for au in XPath('//fb2:%s/fb2:author'%author_sec)(root):
+            author = _parse_author(au)
+            if author:
+                authors.append(author)
+        # Test the accumulated list, not the loop variable: the loop
+        # variable is unbound when the section has no <author> element
+        # and only reflects the last author otherwise.
+        if authors:
+            break
+
+    # if no author so far
+    if not authors:
+        authors.append(_('Unknown'))
+
+    return authors
+
+def _parse_author(elm_author):
+    """Return the display name for one <author> element as a string.
+
+    The name is built from the whitespace-normalized first, middle and
+    last name, in that order, separated by single spaces.  When all three
+    are empty the nickname is returned instead (which may be empty too).
+    """
+
+    def field(tag):
+        return XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)
+
+    name = field('first-name')
+    for part in (field('middle-name'), field('last-name')):
+        if part:
+            name = (name + ' ' + part).strip()
+
+    # fallback to nickname
+    return name or field('nickname')
+
+
+
+def _parse_book_title(root):
+    """Return the whitespace-normalized book title, or an empty string.
+
+    <title-info> has priority (it is the mandatory section); <publish-info>
+    and then <src-title-info> are fallbacks.
+    """
+    # Each section is queried separately.  Converting a union of node-sets
+    # to a string picks the node that is first in *document* order, so a
+    # single 'a|b|c' expression cannot express this priority.
+    book_title = ''
+    for section in ('title-info', 'publish-info', 'src-title-info'):
+        book_title = XPath(
+            'normalize-space(//fb2:%s/fb2:book-title/text())' % section)(root)
+        if book_title:
+            break
+
+    return book_title
+
+def _parse_cover(root, mi):
+    """Set mi.cover_data from the image referenced by a <coverpage>.
+
+    The xlink:href of the first matching <coverpage>/<image> in document
+    order is used; the text after '#' is the id of the <binary> element
+    that holds the picture.  Nothing happens when there is no such
+    reference.
+    """
+    imgid = XPath('substring-after(string(//fb2:coverpage/fb2:image/@xlink:href), "#")')(root)
+    if imgid:
+        try:
+            _parse_cover_data(root, imgid, mi)
+        except Exception:
+            # A broken cover must not abort metadata reading; Exception
+            # (rather than a bare except) keeps KeyboardInterrupt and
+            # SystemExit propagating.
+            pass
+
+def _parse_cover_data(root, imgid, mi):
+    """Decode the <binary> element with id `imgid` into mi.cover_data.
+
+    mi.cover_data is set to (extension without the leading dot, decoded
+    bytes).  The mime type defaults to image/jpeg when the element has no
+    content-type attribute.  A warning is printed, and nothing is set,
+    when no file extension is known for the mime type.
+    """
+    # The id is passed as an XPath variable instead of being interpolated
+    # into the expression, so ids containing quotes cannot break it.
+    elm_binary = XPath('//fb2:binary[@id=$imgid]')(root, imgid=imgid)
+    if elm_binary:
+        mimetype = elm_binary[0].get('content-type', 'image/jpeg')
+        mime_extensions = guess_all_extensions(mimetype)
+        if mime_extensions:
+            pic_data = elm_binary[0].text
+            if pic_data:
+                mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
+        else:
+            prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )
+
+def _parse_tags(root, mi):
+    """Set mi.tags from the <genre> entries of a single section.
+
+    Genres are read from <title-info>; <src-title-info> is consulted only
+    when <title-info> has none, so tags never mix both sections.
+    """
+    for section in ('title-info', 'src-title-info'):
+        # -- i18n Translations-- ?
+        genres = XPath('//fb2:%s/fb2:genre/text()' % section)(root)
+        if not genres:
+            continue
+        mi.tags = [unicode(genre) for genre in genres]
+        return
+
+def _parse_series(root, mi):
+    """Set mi.series and mi.series_index from the first <sequence>.
+
+    calibre supports only one series, so the first matching <sequence> of
+    <title-info> or <publish-info> is used (<src-title-info> is ignored).
+    mi.series_index is set only when the series has a name: to the
+    'number' attribute converted to float, or to None when the attribute
+    is missing or not a number.
+    """
+    xp_ti = '//fb2:title-info/fb2:sequence[1]'
+    xp_pi = '//fb2:publish-info/fb2:sequence[1]'
+
+    elms_sequence = XPath('%s|%s' % (xp_ti, xp_pi))(root)
+    if elms_sequence:
+        sequence = elms_sequence[0]
+        mi.series = sequence.get('name', None)
+        if mi.series:
+            # Store a number rather than the raw attribute string.
+            try:
+                mi.series_index = float(sequence.get('number', None))
+            except (TypeError, ValueError):
+                mi.series_index = None
+
+def _parse_isbn(root, mi):
+    """Set mi.isbn from <publish-info>/<isbn> when check_isbn accepts it.
+
+    Some files list several ISBNs separated by commas, which is not
+    allowed; only the text before the first comma is considered.
+    """
+    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
+    comma_pos = isbn.find(',')
+    if comma_pos >= 0:
+        isbn = isbn[:comma_pos]
+    if not check_isbn(isbn):
+        return
+    mi.isbn = isbn
+
+def _parse_comments(root, mi):
+    """Set mi.comments to the text of the first <annotation> found.
+
+    <title-info> is searched first; <src-title-info> is the fallback.
+    Only one section is ever used.
+    """
+    for section in ('title-info', 'src-title-info'):
+        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
+        if not annotations:
+            continue
+        # TODO: tags i18n, xslt?
+        mi.comments = tostring(annotations[0])
+        return
+
+def _parse_publisher(root, mi):
+    """Set mi.publisher from <publish-info>/<publisher> when non-empty."""
+    publisher = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
+    if not publisher:
+        return
+    mi.publisher = publisher
+
+def _parse_pubdate(root, mi):
+    """Set mi.pubdate to January 1st of the <publish-info>/<year> value.
+
+    Nothing is set unless the XPath number() result is a whole number
+    (a missing or non-numeric year evaluates to NaN).
+    """
+    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
+    if not year.is_integer():
+        return
+    # only year is available, so use 1-st of Jan
+    mi.pubdate = datetime.date(int(year), 1, 1)
+
+def _parse_timestamp(root, mi):
+    """Set mi.timestamp from <document-info>/<date> when one is present.
+
+    Both the 'value' attribute and the element text are candidates; the
+    string value of the combined node-set is handed to parse_date.
+    """
+    #<date value="1996-12-03">03.12.1996</date>
+    candidates = ('//fb2:document-info/fb2:date/@value|'
+                  '//fb2:document-info/fb2:date/text()')
+    docdate = XPath('string(%s)' % candidates)(root)
+    if not docdate:
+        return
+    mi.timestamp = parse_date(docdate)
+
+def _parse_language(root, mi):
+    """Set mi.language and mi.languages from <title-info>/<lang>.
+
+    Both attributes are left untouched when the element is missing or
+    empty.
+    """
+    lang = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
+    if not lang:
+        return
+    mi.language = lang
+    mi.languages = [ lang ]
+
+def _parse_uuid(root, mi):
+    """Set mi.uuid from <document-info>/<id> when it is non-empty."""
+    # <document-info> lives in the fb2 namespace like every other element;
+    # without the prefix the step can never match it.
+    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
+    if uuid:
+        mi.uuid = uuid
+
+def _get_fbroot(stream):
+    """Read all of `stream`, pass it through xml_to_unicode and return the root element parsed by a recovering, network-disabled lxml parser."""
    parser = etree.XMLParser(recover=True, no_network=True)
    raw = stream.read()
-    raw = xml_to_unicode(raw, strip_encoding_pats=True,
-            assume_utf8=True)[0]
+    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    root = etree.fromstring(raw, parser=parser)
-    authors, author_sort = [], None
-    for au in XPath('//fb2:author')(root):
-        fname = lname = author = None
-        fe = XPath('descendant::fb2:first-name')(au)
-        if fe:
-            fname = tostring(fe[0])
-            author = fname
-        le = XPath('descendant::fb2:last-name')(au)
-        if le:
-            lname = tostring(le[0])
-            if author:
-                author += ' '+lname
-            else:
-                author = lname
-        if author:
-            authors.append(author)
-        if len(authors) == 1 and author is not None:
-            if lname:
-                author_sort = lname
-            if fname:
-                if author_sort: author_sort += ', '+fname
-                else: author_sort = fname
-    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
-        _('Unknown'))))[0]
-    for x in XPath('//fb2:book-title')(root):
-        title = tostring(x)
-        break
-    comments = ''
-    for x in XPath('//fb2:annotation')(root):
-        comments += tostring(x)
-    if not comments:
-        comments = None
-    tags = list(map(tostring, XPath('//fb2:genre')(root)))
+    return root

-    cp = XPath('//fb2:coverpage')(root)
-    cdata = None
-    if cp:
-        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
-        if cimage:
-            id = cimage[0].get(XLINK('href')).replace('#', '')
-            binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
-            if binary:
-                mt = binary[0].get('content-type', 'image/jpeg')
-                exts = guess_all_extensions(mt)
-                if not exts:
-                    exts = ['.jpg']
-                cdata = (exts[0][1:], b64decode(tostring(binary[0])))
-
-    series = None
-    series_index = 1.0
-    for x in XPath('//fb2:sequence')(root):
-        series = x.get('name', None)
-        if series is not None:
-            series_index = x.get('number', 1.0)
-            break
-    mi = MetaInformation(title, authors)
-    mi.comments = comments
-    mi.author_sort = author_sort
-    if tags:
-        mi.tags = tags
-    mi.series = series
-    mi.series_index = series_index
-    if cdata:
-        mi.cover_data = cdata
-    return mi