FB2 Metadata Reader: Fix cover extraction and author_sort extraction, support multiple authors. Improve the Irish Times recipe

commit c30fb96ebf
parent 49ab346619
Author: Kovid Goyal
Date:   2009-08-12 11:45:30 -06:00

2 changed files with 99 additions and 65 deletions

@@ -5,50 +5,82 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
 '''Read meta information from fb2 files'''

-import mimetypes
+import mimetypes, os
 from base64 import b64decode
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from lxml import etree
 from calibre.ebooks.metadata import MetaInformation

+XLINK_NS = 'http://www.w3.org/1999/xlink'
+def XLINK(name):
+    return '{%s}%s' % (XLINK_NS, name)
+
 def get_metadata(stream):
     """ Return metadata as a L{MetaInfo} object """
-    soup = BeautifulStoneSoup(stream.read())
-    firstname = soup.find("first-name").contents[0]
-    lastname = soup.find("last-name").contents[0]
-    author= [firstname+" "+lastname]
-    title = soup.find("book-title").string
-    comments = soup.find("annotation")
-    tags = soup.findAll('genre')
-    tags = [t.contents[0] for t in tags]
-    cp = soup.find('coverpage')
+    XPath = lambda x : etree.XPath(x,
+            namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0',
+                'xlink':XLINK_NS})
+    tostring = lambda x : etree.tostring(x, method='text',
+            encoding=unicode).strip()
+    root = etree.fromstring(stream.read())
+    authors, author_sort = [], None
+    for au in XPath('//fb2:author')(root):
+        fname = lname = author = None
+        fe = XPath('descendant::fb2:first-name')(au)
+        if fe:
+            fname = tostring(fe[0])
+            author = fname
+        le = XPath('descendant::fb2:last-name')(au)
+        if le:
+            lname = tostring(le[0])
+            author += ' '+lname
+        if author:
+            authors.append(author)
+        if len(authors) == 1 and author is not None:
+            if lname:
+                author_sort = lname
+            if fname:
+                if author_sort: author_sort += ', '+fname
+                else: author_sort = fname
+    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
+        _('Unknown'))))[0]
+    for x in XPath('//fb2:book-title')(root):
+        title = tostring(x)
+    comments = ''
+    for x in XPath('//fb2:annotation')(root):
+        comments += tostring(x)
+    if not comments:
+        comments = None
+    tags = list(map(tostring, XPath('//fb2:genre')(root)))
+    cp = XPath('//fb2:coverpage')(root)
     cdata = None
     if cp:
-        cimage = cp.find('image', attrs={'l:href':True})
+        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
         if cimage:
-            id = cimage['l:href'].replace('#', '')
-            binary = soup.find('binary', id=id, attrs={'content-type':True})
+            id = cimage[0].get(XLINK('href')).replace('#', '')
+            binary = XPath('//fb2:binary[@id="%s"]'%id)(root)
             if binary:
-                mt = binary['content-type']
+                mt = binary[0].get('content-type', 'image/jpeg')
                 exts = mimetypes.guess_all_extensions(mt)
                 if not exts:
                     exts = ['.jpg']
-                cdata = (exts[0][1:], b64decode(binary.string.strip()))
-    if comments:
-        comments = u''.join(comments.findAll(text=True))
-    series = soup.find("sequence")
-    mi = MetaInformation(title, author)
+                cdata = (exts[0][1:], b64decode(tostring(binary[0])))
+    series = None
+    series_index = 1.0
+    for x in XPath('//fb2:sequence')(root):
+        series = x.get('name', None)
+        if series is not None:
+            series_index = x.get('number', 1.0)
+            break
+    mi = MetaInformation(title, authors)
     mi.comments = comments
-    mi.author_sort = lastname+'; '+firstname
+    mi.author_sort = author_sort
     if tags:
         mi.tags = tags
-    if series:
-        mi.series = series.get('name', None)
-        try:
-            mi.series_index = float(series.get('number', None))
-        except (TypeError, ValueError):
-            pass
+    mi.series = series
+    mi.series_index = series_index
     if cdata:
         mi.cover_data = cdata
     return mi
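For context, a minimal sketch of how the reworked reader behaves end to end. The FB2 sample, the names in it, and the calibre.ebooks.metadata.fb2 import path are illustrative assumptions, not part of the commit; the behaviour noted in the comments follows from the code in the diff above.

# Hypothetical FB2 document exercising the new code paths: two <author>
# elements, an <annotation>, a <sequence>, and a cover image referenced
# through xlink:href and stored as a base64 <binary>.
from cStringIO import StringIO

FB2_SAMPLE = '''<?xml version="1.0" encoding="utf-8"?>
<FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0"
             xmlns:l="http://www.w3.org/1999/xlink">
  <description>
    <title-info>
      <genre>sf</genre>
      <author><first-name>Anna</first-name><last-name>Adams</last-name></author>
      <author><first-name>Ben</first-name><last-name>Brown</last-name></author>
      <book-title>Sample Book</book-title>
      <annotation><p>A short synopsis.</p></annotation>
      <coverpage><image l:href="#cover.jpg"/></coverpage>
      <sequence name="Sample Series" number="2"/>
    </title-info>
  </description>
  <binary id="cover.jpg" content-type="image/jpeg">/9j/4AAQSkZJRg==</binary>
</FictionBook>'''

from calibre.ebooks.metadata.fb2 import get_metadata  # assumed module path for the reader above

mi = get_metadata(StringIO(FB2_SAMPLE))
print mi.authors       # both <author> entries are kept: Anna Adams, Ben Brown
print mi.author_sort   # 'Adams, Anna' -- sort string built from the first author only
print mi.title         # taken from <book-title>, not from the stream name
print mi.tags          # one tag per <genre>
print mi.series, mi.series_index   # from <sequence>; note that @number comes back as a string
print mi.cover_data[0] # extension guessed from the <binary> content-type, e.g. 'jpg'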

@@ -1,17 +1,20 @@
 __license__ = 'GPL v3'
-__copyright__ = '2008, Derry FitzGerald'
+__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
 '''
 irishtimes.com
 '''
+import re

 from calibre.web.feeds.news import BasicNewsRecipe

 class IrishTimes(BasicNewsRecipe):
     title = u'The Irish Times'
-    __author__ = 'Derry FitzGerald'
+    __author__ = 'Derry FitzGerald and Ray Kinsella'
     language = _('English')

     no_stylesheets = True
+    simultaneous_downloads= 1
+    r = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*')

     remove_tags = [dict(name='div', attrs={'class':'footer'})]

     extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
@@ -24,14 +27,13 @@ class IrishTimes(BasicNewsRecipe):
             ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
             ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
             ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
-            ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
-            ('Education and Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
-            ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
-            ('The Ticket', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
-            ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
-            ('News Features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
-            ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
           ]

     def print_version(self, url):
         return url.replace('.html', '_pf.html')
+
+    def get_article_url(self, article):
+        m = self.r.match(article.get('description', None))
+        print m.group('url')
+        return m.group('url')
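A standalone sketch of what the recipe's new get_article_url/print_version pair does with a feed item. The description string and article URL are invented for illustration; the regex expects the feed description to embed a link of this shape.

# Invented feed-item description; the regex and the '_pf.html' rewrite are
# taken verbatim from the recipe above.
import re

r = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*')

description = ('Full story: '
               '<a href="http://www.irishtimes.com/newspaper/ireland/2009/0812/1224001234567.html">'
               'irishtimes.com</a>')

m = r.match(description)
if m is not None:
    url = m.group('url')
    print url                               # the article page pulled out of the description
    print url.replace('.html', '_pf.html')  # the printer-friendly page print_version() fetches

Note that the recipe itself calls m.group() unconditionally, so a feed entry whose description contains no matching link would raise an AttributeError rather than being skipped.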