FB2 Metadata Reader: Fix cover extraction, author_sort extraction, support multiple authors. Improved Irish Times recipe

This commit is contained in:
Kovid Goyal 2009-08-12 11:45:30 -06:00
parent 49ab346619
commit c30fb96ebf
2 changed files with 99 additions and 65 deletions

View File

@ -5,50 +5,82 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
'''Read meta information from fb2 files''' '''Read meta information from fb2 files'''
import mimetypes import mimetypes, os
from base64 import b64decode from base64 import b64decode
from lxml import etree
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
# Namespace URI used by the xlink:href attribute on FB2 <image> elements.
XLINK_NS = 'http://www.w3.org/1999/xlink'


def XLINK(name):
    """Return ``name`` qualified with the XLink namespace.

    The result uses the ``{namespace-uri}local-name`` form, e.g.
    ``XLINK('href')`` gives ``'{http://www.w3.org/1999/xlink}href'``.
    """
    return '{%s}%s' % (XLINK_NS, name)
def get_metadata(stream):
    """Return metadata as a L{MetaInfo} object.

    Parses the FB2 XML read from ``stream`` and extracts the title, the
    authors, an author sort string, the annotation text, the genres (stored
    as tags), the series name/index and the cover image.

    :param stream: file-like object whose ``read()`` returns the FB2
        document. If it has a ``name`` attribute, the base name without
        extension is used as the title when no ``book-title`` is present.
    :return: a ``MetaInformation`` instance. ``author_sort`` is only set
        when exactly one author name was found; ``cover_data`` is only set
        when the cover page references an embedded ``binary`` element.
    """
    XPath = lambda x: etree.XPath(x,
            namespaces={'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
                        'xlink': XLINK_NS})
    tostring = lambda x: etree.tostring(x, method='text',
            encoding=unicode).strip()
    root = etree.fromstring(stream.read())

    # Collect one display name per <author>. Remember the name parts of the
    # author that actually produced a name so author_sort is built from that
    # author and not merely from the last <author> element visited.
    authors, name_parts = [], None
    for au in XPath('//fb2:author')(root):
        fe = XPath('descendant::fb2:first-name')(au)
        le = XPath('descendant::fb2:last-name')(au)
        fname = tostring(fe[0]) if fe else None
        lname = tostring(le[0]) if le else None
        # Joining only the non-empty parts avoids ``None + ' ' + lname``
        # when an author has a last name but no first name.
        author = ' '.join(part for part in (fname, lname) if part)
        if author:
            authors.append(author)
            name_parts = (fname, lname)

    author_sort = None
    if len(authors) == 1:
        fname, lname = name_parts
        # "Last, First", degrading to whichever part exists.
        author_sort = ', '.join(part for part in (lname, fname) if part)

    # Fall back to the file name; the last <book-title> found wins.
    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
        _('Unknown'))))[0]
    for x in XPath('//fb2:book-title')(root):
        title = tostring(x)

    comments = ''.join(tostring(x) for x in XPath('//fb2:annotation')(root))
    if not comments:
        comments = None

    tags = [tostring(x) for x in XPath('//fb2:genre')(root)]

    # The cover page holds an <image> whose xlink:href is "#<id>" of a
    # base64 encoded <binary> element elsewhere in the document.
    cdata = None
    cp = XPath('//fb2:coverpage')(root)
    if cp:
        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
        if cimage:
            cover_id = cimage[0].get(XLINK('href')).replace('#', '')
            # Pass the id as an XPath variable instead of formatting it into
            # the expression, so ids containing quotes cannot break it.
            binary = XPath('//fb2:binary[@id=$cover_id]')(root,
                    cover_id=cover_id)
            if binary:
                mt = binary[0].get('content-type', 'image/jpeg')
                exts = mimetypes.guess_all_extensions(mt)
                if not exts:
                    exts = ['.jpg']
                cdata = (exts[0][1:], b64decode(tostring(binary[0])))

    # Use the first <sequence> that carries a name.
    series = None
    series_index = 1.0
    for x in XPath('//fb2:sequence')(root):
        series = x.get('name', None)
        if series is not None:
            # The attribute is a string; keep series_index numeric and
            # tolerate malformed values.
            try:
                series_index = float(x.get('number', 1.0))
            except (TypeError, ValueError):
                series_index = 1.0
            break

    mi = MetaInformation(title, authors)
    mi.comments = comments
    mi.author_sort = author_sort
    if tags:
        mi.tags = tags
    mi.series = series
    mi.series_index = series_index
    if cdata:
        mi.cover_data = cdata
    return mi

View File

@ -1,37 +1,39 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Derry FitzGerald' __copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
''' '''
irishtimes.com irishtimes.com
''' '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class IrishTimes(BasicNewsRecipe):
    """News recipe for The Irish Times (irishtimes.com) RSS feeds."""

    title          = u'The Irish Times'
    __author__     = 'Derry FitzGerald and Ray Kinsella'
    language       = _('English')
    no_stylesheets = True
    simultaneous_downloads = 1

    # Picks an irishtimes.com ".html" article URL out of a feed entry's
    # description text (see get_article_url).
    r = re.compile(r'.*(?P<url>http://www\.irishtimes\.com/.*\.html).*')

    remove_tags = [dict(name='div', attrs={'class':'footer'})]
    extra_css   = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    feeds = [
        ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
        ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
        ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
        ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
        ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
        ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
        ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
        ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
    ]

    def print_version(self, url):
        """Return the printer-friendly URL: ``.html`` becomes ``_pf.html``."""
        return url.replace('.html', '_pf.html')

    def get_article_url(self, article):
        """Return the article URL embedded in the feed entry's description.

        When the entry has no description, or the description contains no
        matching irishtimes.com URL, defer to the base class implementation
        instead of raising.
        """
        description = article.get('description', None)
        m = self.r.match(description) if description else None
        if m is None:
            return BasicNewsRecipe.get_article_url(self, article)
        return m.group('url')