FB2 Metadata Reader: Fix cover extraction, author_sort extraction, support multiple authors. Improved Irish Times recipe

2026-06-07 22:45:32 -04:00 · 2009-08-12 11:45:30 -06:00
parent 49ab346619
commit c30fb96ebf
2 changed files with 99 additions and 65 deletions
@@ -5,50 +5,82 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'

 '''Read meta information from fb2 files'''

-import mimetypes
+import mimetypes, os
 from base64 import b64decode
-
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from lxml import etree
 from calibre.ebooks.metadata import MetaInformation

+XLINK_NS     = 'http://www.w3.org/1999/xlink'
+def XLINK(name):
+    return '{%s}%s' % (XLINK_NS, name)
+
+
 def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
-    soup =  BeautifulStoneSoup(stream.read())
-    firstname = soup.find("first-name").contents[0]
-    lastname = soup.find("last-name").contents[0]
-    author= [firstname+" "+lastname]
-    title = soup.find("book-title").string
-    comments = soup.find("annotation")
-    tags = soup.findAll('genre')
-    tags = [t.contents[0] for t in tags]
-    cp = soup.find('coverpage')
+    XPath = lambda x : etree.XPath(x,
+            namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0',
+                'xlink':XLINK_NS})
+    tostring = lambda x : etree.tostring(x, method='text',
+            encoding=unicode).strip()
+    root = etree.fromstring(stream.read())
+    authors, author_sort = [], None
+    for au in XPath('//fb2:author')(root):
+        fname = lname = author = None
+        fe = XPath('descendant::fb2:first-name')(au)
+        if fe:
+            fname = tostring(fe[0])
+            author = fname
+        le = XPath('descendant::fb2:last-name')(au)
+        if le:
+            lname = tostring(le[0])
+            author += ' '+lname
+        if author:
+            authors.append(author)
+        if len(authors) == 1 and author is not None:
+            if lname:
+                author_sort = lname
+            if fname:
+                if author_sort: author_sort += ', '+fname
+                else: author_sort = fname
+    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
+        _('Unknown'))))[0]
+    for x in XPath('//fb2:book-title')(root):
+        title = tostring(x)
+    comments = ''
+    for x in XPath('//fb2:annotation')(root):
+        comments += tostring(x)
+    if not comments:
+        comments = None
+    tags = list(map(tostring, XPath('//fb2:genre')(root)))
+
+    cp = XPath('//fb2:coverpage')(root)
    cdata = None
    if cp:
-        cimage = cp.find('image', attrs={'l:href':True})
+        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
        if cimage:
-            id = cimage['l:href'].replace('#', '')
-            binary = soup.find('binary', id=id, attrs={'content-type':True})
+            id = cimage[0].get(XLINK('href')).replace('#', '')
+            binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
            if binary:
-                mt = binary['content-type']
+                mt = binary[0].get('content-type', 'image/jpeg')
                exts = mimetypes.guess_all_extensions(mt)
                if not exts:
                    exts = ['.jpg']
-                cdata = (exts[0][1:], b64decode(binary.string.strip()))
+                cdata = (exts[0][1:], b64decode(tostring(binary[0])))

-    if comments:
-        comments = u''.join(comments.findAll(text=True))
-    series = soup.find("sequence")
-    mi = MetaInformation(title, author)
+    series = None
+    series_index = 1.0
+    for x in XPath('//fb2:sequence')(root):
+        series = x.get('name', None)
+        if series is not None:
+            series_index = x.get('number', 1.0)
+            break
+    mi = MetaInformation(title, authors)
    mi.comments = comments
-    mi.author_sort = lastname+'; '+firstname
+    mi.author_sort = author_sort
    if tags:
        mi.tags = tags
-    if series:
-        mi.series = series.get('name', None)
-        try:
-            mi.series_index = float(series.get('number', None))
-        except (TypeError, ValueError):
-            pass
+    mi.series = series
+    mi.series_index = series_index
    if cdata:
        mi.cover_data = cdata
    return mi
@@ -1,37 +1,39 @@
-__license__   = 'GPL v3'
-__copyright__ = '2008, Derry FitzGerald'
-'''
-irishtimes.com
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class IrishTimes(BasicNewsRecipe):
-    title          = u'The Irish Times'
-    __author__     = 'Derry FitzGerald'
-    language = _('English')
-    no_stylesheets = True
-
-    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
-    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
-
-    feeds          = [
-                      ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), 
-                      ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
-                      ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
-                      ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
-                      ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
-                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
-                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
-                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
-                      ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
-                      ('Education and Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
-                      ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
-                      ('The Ticket', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
-                      ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
-                      ('News Features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
-                      ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
-                    ]
-
-    def print_version(self, url):
-        return url.replace('.html', '_pf.html')
+__license__   = 'GPL v3'
+__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
+'''
+irishtimes.com
+'''
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class IrishTimes(BasicNewsRecipe):
+    title          = u'The Irish Times'
+    __author__     = 'Derry FitzGerald and Ray Kinsella'
+    language = _('English')
+    no_stylesheets = True
+    simultaneous_downloads= 1
+	
+    r 			   = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*')
+    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
+    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
+
+    feeds          = [
+                      ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), 
+                      ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
+                      ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
+                      ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
+                      ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
+                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
+                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
+                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
+                    ]
+	
+
+    def print_version(self, url):
+        return url.replace('.html', '_pf.html')
+		
+    def get_article_url(self, article):
+        m = self.r.match(article.get('description',  None))
+        print m.group('url')
+        return m.group('url')