__copyright__ = '2008, Anatoly Shipitsin '
'''Read meta information from fb2 files'''

import mimetypes
import os
from base64 import b64decode

FB2_NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
XLINK_NS = 'http://www.w3.org/1999/xlink'


def XLINK(name):
    """Return *name* qualified with the XLink namespace (``{ns}name`` form)."""
    return '{%s}%s' % (XLINK_NS, name)


def _author_names(fname, lname):
    """Return ``(display_name, sort_name)`` built from optional name parts.

    Either part may be ``None`` or empty. The display name is
    ``"First Last"`` and the sort name is ``"Last, First"``; when only one
    part is present it is used for both. ``(None, None)`` is returned when
    no part is present.
    """
    parts = [part for part in (fname, lname) if part]
    if not parts:
        return None, None
    return ' '.join(parts), ', '.join(reversed(parts))


def _series_index(value, default=1.0):
    """Convert a ``<sequence number="...">`` attribute value to a float.

    Returns *default* when the attribute is missing (``None``) or is not a
    valid number.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def get_metadata(stream):
    """Return the metadata of an FB2 document as a MetaInformation object.

    *stream* is a file-like object whose ``read()`` returns the complete
    FB2 (XML) document. If the stream has a ``name`` attribute, its base
    name without extension is used as the title when the document has no
    usable ``<book-title>``.

    The result carries title, authors, author_sort (derived from the first
    named author), comments (text of all annotations, or ``None``), tags
    (genres), series / series_index, and, when a cover image can be
    decoded, ``cover_data`` as an ``(extension, bytes)`` tuple.

    Exceptions raised by lxml for malformed XML are propagated.
    """
    # Imported here rather than at module level so the helpers above can be
    # imported (and tested) without lxml or calibre being available.
    from lxml import etree
    from calibre.ebooks.metadata import MetaInformation

    namespaces = {'fb2': FB2_NS, 'xlink': XLINK_NS}

    def XPath(expr):
        return etree.XPath(expr, namespaces=namespaces)

    def tostring(elem):
        return etree.tostring(elem, method='text', encoding=unicode).strip()

    root = etree.fromstring(stream.read())

    # Prefer the book's own authors; a bare //fb2:author would also pick up
    # document-info/author, i.e. whoever produced the FB2 file. Fall back to
    # any author element so files without title-info authors still work.
    author_elems = XPath('//fb2:title-info/fb2:author')(root)
    if not author_elems:
        author_elems = XPath('//fb2:author')(root)

    find_first_name = XPath('descendant::fb2:first-name')
    find_last_name = XPath('descendant::fb2:last-name')
    authors, author_sort = [], None
    for au in author_elems:
        fe = find_first_name(au)
        le = find_last_name(au)
        fname = tostring(fe[0]) if fe else None
        lname = tostring(le[0]) if le else None
        author, sort_name = _author_names(fname, lname)
        if author:
            authors.append(author)
            if len(authors) == 1:
                # author_sort describes the first named author only.
                author_sort = sort_name

    # Default title: the stream's file name without its extension.
    title = os.path.splitext(os.path.basename(getattr(stream, 'name',
        _('Unknown'))))[0]
    for x in XPath('//fb2:book-title')(root):
        text = tostring(x)
        if text:  # do not let an empty <book-title> erase the fallback
            title = text

    comments = ''.join(tostring(x) for x in XPath('//fb2:annotation')(root))
    if not comments:
        comments = None

    tags = [tostring(x) for x in XPath('//fb2:genre')(root)]

    cdata = None
    cp = XPath('//fb2:coverpage')(root)
    if cp:
        cimage = XPath('descendant::fb2:image[@xlink:href]')(cp[0])
        if cimage:
            cover_id = cimage[0].get(XLINK('href')).replace('#', '')
            # Pass the id as an XPath variable instead of interpolating it
            # into the expression, so ids containing quotes cannot break it.
            binary = XPath('//fb2:binary[@id=$cover_id]')(
                root, cover_id=cover_id)
            if binary:
                mt = binary[0].get('content-type', 'image/jpeg')
                exts = mimetypes.guess_all_extensions(mt)
                if not exts:
                    exts = ['.jpg']
                try:
                    cdata = (exts[0][1:], b64decode(tostring(binary[0])))
                except (TypeError, ValueError):
                    # Corrupt base64 (TypeError on Python 2, binascii.Error,
                    # a ValueError subclass, on Python 3): skip the cover
                    # rather than losing all of the metadata.
                    cdata = None

    series = None
    series_index = 1.0
    for x in XPath('//fb2:sequence')(root):
        series = x.get('name', None)
        if series is not None:
            series_index = _series_index(x.get('number', None))
            break

    mi = MetaInformation(title, authors)
    mi.comments = comments
    mi.author_sort = author_sort
    if tags:
        mi.tags = tags
    mi.series = series
    mi.series_index = series_index
    if cdata:
        mi.cover_data = cdata
    return mi
__license__ = 'GPL v3'
__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella'
'''
irishtimes.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe


class IrishTimes(BasicNewsRecipe):
    """Recipe for The Irish Times, built from the site's newspaper RSS feeds."""

    title = u'The Irish Times'
    __author__ = 'Derry FitzGerald and Ray Kinsella'
    language = _('English')
    no_stylesheets = True
    simultaneous_downloads = 1

    # Captures, as group 'url', an irishtimes.com .html link embedded in a
    # feed item's description (see get_article_url).
    r = re.compile(r'.*(?P<url>http://www\.irishtimes\.com/.*\.html).*')
    remove_tags = [dict(name='div', attrs={'class':'footer'})]
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    feeds = [
        ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
        ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
        ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
        ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
        ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
        ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
        ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
        ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
    ]

    def print_version(self, url):
        """Return the printer-friendly variant of an article *url*."""
        return url.replace('.html', '_pf.html')

    def get_article_url(self, article):
        """Return the article URL found in the feed item's description.

        *article* is the parsed feed entry (a dict-like object). When the
        entry has no description, or the description contains no
        irishtimes.com ``.html`` link, the base class implementation is
        used instead of raising.
        """
        description = article.get('description', None) or ''
        m = self.r.match(description)
        if m is None:
            return BasicNewsRecipe.get_article_url(self, article)
        return m.group('url')