mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
FB2 Metadata Reader: Fix cover extraction, author_sort extraction, support multiple authors. Improved Irish Times recipe
This commit is contained in:
parent
49ab346619
commit
c30fb96ebf
@ -5,50 +5,82 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
|||||||
|
|
||||||
'''Read meta information from fb2 files'''
|
'''Read meta information from fb2 files'''
|
||||||
|
|
||||||
import mimetypes
|
import mimetypes, os
|
||||||
from base64 import b64decode
|
from base64 import b64decode
|
||||||
|
from lxml import etree
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
|
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
|
|
||||||
|
XLINK_NS = 'http://www.w3.org/1999/xlink'

def XLINK(name):
    '''Qualify *name* with the XLink namespace in Clark ({uri}local) notation.'''
    return '{' + XLINK_NS + '}' + name
|
|
||||||
|
|
||||||
def get_metadata(stream):
    """ Return metadata as a L{MetaInfo} object """
    XPath = lambda x : etree.XPath(x,
            namespaces={'fb2':'http://www.gribuser.ru/xml/fictionbook/2.0',
                        'xlink':XLINK_NS})
    # Flatten an element subtree to its stripped unicode text content
    tostring = lambda x : etree.tostring(x, method='text',
                                         encoding=unicode).strip()
    root = etree.fromstring(stream.read())

    # Authors: combine first and last names. Either part may be absent in
    # the FB2; handle a missing first-name (the naive author += ' '+lname
    # raises TypeError when author is still None).
    authors, author_sort = [], None
    fname = lname = author = None
    for au in XPath('//fb2:author')(root):
        fname = lname = author = None
        fe = XPath('descendant::fb2:first-name')(au)
        if fe:
            fname = tostring(fe[0])
            author = fname
        le = XPath('descendant::fb2:last-name')(au)
        if le:
            lname = tostring(le[0])
            author = author + ' ' + lname if author else lname
        if author:
            authors.append(author)
    # Only synthesize an author_sort ("Last, First") for a single author
    if len(authors) == 1 and author is not None:
        if lname:
            author_sort = lname
        if fname:
            if author_sort: author_sort += ', '+fname
            else: author_sort = fname

    # Title: fall back to the stream's file name when the FB2 has no
    # book-title element
    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
        _('Unknown'))))[0]
    for x in XPath('//fb2:book-title')(root):
        title = tostring(x)

    # Comments: concatenate all annotation elements; empty means None
    comments = ''
    for x in XPath('//fb2:annotation')(root):
        comments += tostring(x)
    if not comments:
        comments = None

    tags = list(map(tostring, XPath('//fb2:genre')(root)))

    # Cover: resolve the coverpage's xlink:href reference (an '#id'
    # fragment) to the matching base64-encoded <binary> element
    cp = XPath('//fb2:coverpage')(root)
    cdata = None
    if cp:
        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
        if cimage:
            img_id = cimage[0].get(XLINK('href')).replace('#', '')
            binary = XPath('//fb2:binary[@id="%s"]'%img_id)(root)
            if binary:
                mt = binary[0].get('content-type', 'image/jpeg')
                exts = mimetypes.guess_all_extensions(mt)
                if not exts:
                    exts = ['.jpg']
                cdata = (exts[0][1:], b64decode(tostring(binary[0])))

    # Series: the XML attribute value is a string; coerce it to a float
    # (callers expect a numeric series_index), defaulting to 1.0
    series, series_index = None, 1.0
    for x in XPath('//fb2:sequence')(root):
        series = x.get('name', None)
        if series is not None:
            try:
                series_index = float(x.get('number', 1.0))
            except (TypeError, ValueError):
                series_index = 1.0
            break

    mi = MetaInformation(title, authors)
    mi.comments = comments
    mi.author_sort = author_sort
    if tags:
        mi.tags = tags
    if series:
        mi.series = series
        mi.series_index = series_index
    if cdata:
        mi.cover_data = cdata
    return mi
|
||||||
|
@ -1,37 +1,39 @@
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Derry FitzGerald'
|
__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
|
||||||
'''
|
'''
|
||||||
irishtimes.com
|
irishtimes.com
|
||||||
'''
|
'''
|
||||||
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
class IrishTimes(BasicNewsRecipe):
    # Fetch The Irish Times newspaper feeds, resolving each article to its
    # printer-friendly page.
    title          = u'The Irish Times'
    __author__     = 'Derry FitzGerald and Ray Kinsella'
    language       = _('English')
    no_stylesheets = True
    simultaneous_downloads = 1

    # Extracts the real article URL embedded in the RSS item description
    r = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*')

    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    feeds          = [
                      ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
                      ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
                      ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
                      ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
                      ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
                     ]

    def print_version(self, url):
        # The _pf variant is the printer-friendly (single-page) version
        return url.replace('.html', '_pf.html')

    def get_article_url(self, article):
        # The feed's link element does not point at the article itself;
        # the real URL is embedded in the item description. Guard against
        # a missing description or a non-matching one (the original code
        # crashed with TypeError/AttributeError in those cases) by falling
        # back to the default resolution.
        description = article.get('description', None)
        if description:
            m = self.r.match(description)
            if m is not None:
                return m.group('url')
        return BasicNewsRecipe.get_article_url(self, article)
Loading…
x
Reference in New Issue
Block a user