mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
FB2 Metadata Reader: Fix cover extraction, author_sort extraction, support multiple authors. Improved Irish Times recipe
This commit is contained in:
parent
49ab346619
commit
c30fb96ebf
@ -5,50 +5,82 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
||||
|
||||
'''Read meta information from fb2 files'''
|
||||
|
||||
import mimetypes
|
||||
import mimetypes, os
|
||||
from base64 import b64decode
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
||||
from lxml import etree
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
|
||||
# Namespace URI of the XLink attributes used by FB2 image references.
XLINK_NS = 'http://www.w3.org/1999/xlink'


def XLINK(name):
    """Return ``name`` qualified with the XLink namespace.

    The result uses the ``{namespace-uri}local-name`` form built from
    ``XLINK_NS``.
    """
    qualified = '{%s}%s' % (XLINK_NS, name)
    return qualified
|
||||
|
||||
|
||||
def get_metadata(stream):
    """Read metadata from an FB2 (FictionBook 2.0) document.

    :param stream: file-like object; its whole content is read and parsed
        as XML. Its ``name`` attribute, when present, supplies the fallback
        title (file name without extension).
    :return: a ``MetaInformation`` object with title and authors set, plus
        ``comments``, ``author_sort``, ``tags``, ``series``/``series_index``
        and ``cover_data`` when the document provides them.
    """
    namespaces = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
                  'xlink': XLINK_NS}

    def xpath(expr):
        # Compile ``expr`` with the FB2 and XLink prefixes bound.
        return etree.XPath(expr, namespaces=namespaces)

    def text_of(elem):
        # Text content of ``elem`` (markup dropped), whitespace-stripped.
        return etree.tostring(elem, method='text', encoding=unicode).strip()

    root = etree.fromstring(stream.read())

    # Collect every author that has at least one name part. The matching
    # "Last, First" form is recorded alongside so author_sort always
    # describes the author that was actually kept.
    authors, sort_names = [], []
    for au in xpath('//fb2:author')(root):
        fe = xpath('descendant::fb2:first-name')(au)
        le = xpath('descendant::fb2:last-name')(au)
        fname = text_of(fe[0]) if fe else None
        lname = text_of(le[0]) if le else None
        # Joining only the non-empty parts avoids concatenating onto None
        # when an author has a last name but no first name.
        author = ' '.join([part for part in (fname, lname) if part])
        if author:
            authors.append(author)
            sort_names.append(
                ', '.join([part for part in (lname, fname) if part]))
    # A sort string is only produced for single-author books.
    author_sort = sort_names[0] if len(authors) == 1 else None

    # Fall back to the file name when the document carries no title.
    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
        _('Unknown'))))[0]
    for x in xpath('//fb2:book-title')(root):
        title = text_of(x)

    comments = ''.join([text_of(x) for x in xpath('//fb2:annotation')(root)])
    if not comments:
        comments = None

    tags = [text_of(x) for x in xpath('//fb2:genre')(root)]

    # The cover page references a base64 <binary> element through an
    # xlink:href of the form "#id".
    cdata = None
    cp = xpath('//fb2:coverpage')(root)
    if cp:
        cimage = xpath('descendant::fb2:image[@xlink:href]')(cp[0])
        if cimage:
            cover_id = cimage[0].get(XLINK('href')).replace('#', '')
            # Pass the id as an XPath variable rather than formatting it
            # into the expression, so ids containing quotes cannot break
            # the query.
            binary = xpath('//fb2:binary[@id=$cover_id]')(
                root, cover_id=cover_id)
            if binary:
                mt = binary[0].get('content-type', 'image/jpeg')
                exts = mimetypes.guess_all_extensions(mt)
                if not exts:
                    exts = ['.jpg']
                cdata = (exts[0][1:], b64decode(text_of(binary[0])))

    # The first <sequence> carrying a name defines the series.
    series = None
    series_index = 1.0
    for x in xpath('//fb2:sequence')(root):
        series = x.get('name', None)
        if series is not None:
            # The attribute value is a string; keep the index numeric and
            # fall back to 1.0 when it is missing or malformed.
            try:
                series_index = float(x.get('number', 1.0))
            except (TypeError, ValueError):
                series_index = 1.0
            break

    mi = MetaInformation(title, authors)
    mi.comments = comments
    mi.author_sort = author_sort
    if tags:
        mi.tags = tags
    if series:
        mi.series = series
        mi.series_index = series_index
    if cdata:
        mi.cover_data = cdata
    return mi
|
||||
|
@ -1,37 +1,39 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Derry FitzGerald'
|
||||
'''
|
||||
irishtimes.com
|
||||
'''
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class IrishTimes(BasicNewsRecipe):
    """Recipe that fetches The Irish Times from its newspaper RSS feeds."""

    # Recipe identity.
    title = u'The Irish Times'
    __author__ = 'Derry FitzGerald'
    language = _('English')

    # Presentation: ignore the site's stylesheets, drop the page footer
    # and apply a small amount of custom styling instead.
    no_stylesheets = True
    remove_tags = [dict(name='div', attrs={'class':'footer'})]
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    # (section title, feed URL) pairs, one per newspaper section.
    feeds = [
        ('Frontpage',
         'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
        ('Ireland',
         'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
        ('World',
         'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
        ('Finance',
         'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
        ('Features',
         'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
        ('Sport',
         'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
        ('Opinion',
         'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
        ('Letters',
         'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
        ('Health',
         'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
        ('Education and Parenting',
         'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
        ('Science Today',
         'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
        ('The Ticket',
         'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
        ('Weekend',
         'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
        ('News Features',
         'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
        ('Magazine',
         'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
    ]

    def print_version(self, url):
        """Return ``url`` with every ``.html`` replaced by ``_pf.html``."""
        printable = url.replace('.html', '_pf.html')
        return printable
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
|
||||
'''
|
||||
irishtimes.com
|
||||
'''
|
||||
import re
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class IrishTimes(BasicNewsRecipe):
    """Recipe that fetches The Irish Times from its newspaper RSS feeds.

    Article addresses are taken from the feed entry's description text
    (see ``get_article_url``) and rewritten to their ``_pf.html`` variant
    (see ``print_version``).
    """

    title = u'The Irish Times'
    __author__ = 'Derry FitzGerald and Ray Kinsella'
    language = _('English')
    no_stylesheets = True
    simultaneous_downloads = 1

    # Captures an irishtimes.com article address embedded in a feed entry's
    # description. Raw string, with the dots of the host name escaped so
    # they only match literal dots.
    r = re.compile(r'.*(?P<url>http://www\.irishtimes\.com/.*\.html).*')
    remove_tags = [dict(name='div', attrs={'class':'footer'})]
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    # (section title, feed URL) pairs, one per newspaper section.
    feeds = [
        ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
        ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
        ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
        ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
        ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
        ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
        ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
        ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
    ]

    def print_version(self, url):
        """Return ``url`` with every ``.html`` replaced by ``_pf.html``."""
        return url.replace('.html', '_pf.html')

    def get_article_url(self, article):
        """Return the article address found in the entry's description.

        :param article: feed entry supporting ``get``; its ``description``
            value is searched for an irishtimes.com ``.html`` address.
        :return: the matched address, or the entry's ``link`` value (None
            when absent) if the description is missing or has no match.
        """
        # A missing description must not be handed to the regex as None.
        description = article.get('description', None) or ''
        m = self.r.match(description)
        if m is None:
            # No embedded address: fall back to the entry's own link
            # instead of failing on a None match object.
            return article.get('link', None)
        return m.group('url')
|
Loading…
x
Reference in New Issue
Block a user