FB2 Metadata Reader: Fix cover extraction, author_sort extraction, support multiple authors. Improved Irish Times recipe

2025-07-07 10:14:46 -04:00 · 2009-08-12 11:45:30 -06:00 · 2009-08-12 11:45:30 -06:00 · c30fb96ebf
commit c30fb96ebf
parent 49ab346619
2 changed files with 99 additions and 65 deletions
--- a/src/calibre/ebooks/metadata/fb2.py
+++ b/src/calibre/ebooks/metadata/fb2.py
@@ -5,50 +5,82 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
 '''Read meta information from fb2 files'''
-import mimetypes
+import mimetypes, os
 from base64 import b64decode
-
+from lxml import etree
 from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
 from calibre.ebooks.metadata import MetaInformation
 XLINK_NS     = 'http://www.w3.org/1999/xlink'
 def XLINK(name):
    return '{%s}%s' % (XLINK_NS, name)
 def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
-    soup =  BeautifulStoneSoup(stream.read())
+    XPath = lambda x : etree.XPath(x,
-    firstname = soup.find("first-name").contents[0]
+            namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0',
-    lastname = soup.find("last-name").contents[0]
+                'xlink':XLINK_NS})
-    author= [firstname+" "+lastname]
+    tostring = lambda x : etree.tostring(x, method='text',
-    title = soup.find("book-title").string
+            encoding=unicode).strip()
-    comments = soup.find("annotation")
+    root = etree.fromstring(stream.read())
-    tags = soup.findAll('genre')
+    authors, author_sort = [], None
-    tags = [t.contents[0] for t in tags]
+    for au in XPath('//fb2:author')(root):
-    cp = soup.find('coverpage')
+        fname = lname = author = None
        fe = XPath('descendant::fb2:first-name')(au)
        if fe:
            fname = tostring(fe[0])
            author = fname
        le = XPath('descendant::fb2:last-name')(au)
        if le:
            lname = tostring(le[0])
            author += ' '+lname
        if author:
            authors.append(author)
        if len(authors) == 1 and author is not None:
            if lname:
                author_sort = lname
            if fname:
                if author_sort: author_sort += ', '+fname
                else: author_sort = fname
    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
        _('Unknown'))))[0]
    for x in XPath('//fb2:book-title')(root):
        title = tostring(x)
    comments = ''
    for x in XPath('//fb2:annotation')(root):
        comments += tostring(x)
    if not comments:
        comments = None
    tags = list(map(tostring, XPath('//fb2:genre')(root)))
    cp = XPath('//fb2:coverpage')(root)
    cdata = None
    if cp:
-        cimage = cp.find('image', attrs={'l:href':True})
+        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
        if cimage:
-            id = cimage['l:href'].replace('#', '')
+            id = cimage[0].get(XLINK('href')).replace('#', '')
-            binary = soup.find('binary', id=id, attrs={'content-type':True})
+            binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
            if binary:
-                mt = binary['content-type']
+                mt = binary[0].get('content-type', 'image/jpeg')
                exts = mimetypes.guess_all_extensions(mt)
                if not exts:
                    exts = ['.jpg']
-                cdata = (exts[0][1:], b64decode(binary.string.strip()))
+                cdata = (exts[0][1:], b64decode(tostring(binary[0])))
-    if comments:
+    series = None
-        comments = u''.join(comments.findAll(text=True))
+    series_index = 1.0
-    series = soup.find("sequence")
+    for x in XPath('//fb2:sequence')(root):
-    mi = MetaInformation(title, author)
+        series = x.get('name', None)
        if series is not None:
            series_index = x.get('number', 1.0)
            break
    mi = MetaInformation(title, authors)
    mi.comments = comments
-    mi.author_sort = lastname+'; '+firstname
+    mi.author_sort = author_sort
    if tags:
        mi.tags = tags
-    if series:
+    mi.series = series
-        mi.series = series.get('name', None)
+    mi.series_index = series_index
        try:
            mi.series_index = float(series.get('number', None))
        except (TypeError, ValueError):
            pass
    if cdata:
        mi.cover_data = cdata
    return mi
--- a/src/calibre/web/feeds/recipes/recipe_irish_times.py
+++ b/src/calibre/web/feeds/recipes/recipe_irish_times.py
@@ -1,37 +1,39 @@
-__license__   = 'GPL v3'
+__license__   = 'GPL v3'
-__copyright__ = '2008, Derry FitzGerald'
+__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
-'''
+'''
-irishtimes.com
+irishtimes.com
-'''
+'''
-
+import re
-from calibre.web.feeds.news import BasicNewsRecipe
+
-
+from calibre.web.feeds.news import BasicNewsRecipe
-class IrishTimes(BasicNewsRecipe):
+
-    title          = u'The Irish Times'
+class IrishTimes(BasicNewsRecipe):
-    __author__     = 'Derry FitzGerald'
+    title          = u'The Irish Times'
-    language = _('English')
+    __author__     = 'Derry FitzGerald and Ray Kinsella'
-    no_stylesheets = True
+    language = _('English')
-
+    no_stylesheets = True
-    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
+    simultaneous_downloads= 1
-    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
+	
-
+    r 			   = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*')
-    feeds          = [
+    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
-                      ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), 
+    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
-                      ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
+
-                      ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
+    feeds          = [
-                      ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
+                      ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), 
-                      ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
+                      ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
-                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
+                      ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
-                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
+                      ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
-                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
+                      ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
-                      ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
+                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
-                      ('Education and Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
+                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
-                      ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
+                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
-                      ('The Ticket', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
+                    ]
-                      ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
+	
-                      ('News Features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
+
-                      ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
+    def print_version(self, url):
-                    ]
+        return url.replace('.html', '_pf.html')
-
+		
-    def print_version(self, url):
+    def get_article_url(self, article):
-        return url.replace('.html', '_pf.html')
+        m = self.r.match(article.get('description',  None))
        print m.group('url')
        return m.group('url')