# Provenance: recipes/spectator_magazine.recipe as updated by calibre commit
# eb03273848bd0b553f055d5b6c39983eb09aeab8 ("Update Spectator Magazine",
# Kovid Goyal, Wed, 1 Jul 2015 11:03:48 +0530), plus guards against missing
# page elements in parse_spec_section() and parse_index().
from calibre.web.feeds.recipes import BasicNewsRecipe


def class_sel(cls):
    """Build a matcher for a whitespace-separated ``class`` attribute value.

    The returned callable takes an attribute value and is truthy only when
    the value is non-empty and one of its whitespace-separated tokens equals
    *cls*. ``class_sel('post-title')`` therefore accepts
    ``'entry post-title'`` but rejects ``'post-title-large'`` and ``None``.
    """
    def f(x):
        return x and cls in x.split()
    return f


class Spectator(BasicNewsRecipe):
    """Recipe producing one feed per section found on the magazine index page."""

    title = 'Spectator Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Magazine'
    language = 'en'

    no_stylesheets = True

    keep_only_tags = dict(name='div', attrs={'class':['article-head', 'article-image', 'article-body']})
    remove_tags = [
        dict(name='div', attrs={'id':['disqus_thread']}),
        dict(attrs={'class':['middle-promo']}),
    ]

    def parse_spec_section(self, div):
        """Extract the title and article list of one magazine section.

        *div* is the tag wrapping the section. Returns a
        ``(section_title, articles)`` tuple, where ``articles`` is a list of
        dicts with the keys ``'title'``, ``'url'`` and ``'description'``.

        Posts that lack a ``post-title`` heading, a link inside it, or an
        ``href`` on that link are logged and skipped, so a single malformed
        post does not abort the whole section.
        """
        sectitle = self.tag_to_string(div.find('h2'))
        self.log('Section:', sectitle)
        articles = []
        # The loop variable has its own name so the section container passed
        # in as ``div`` is not rebound while its children are being walked.
        for post in div.findAll('div', id=lambda x: x and x.startswith('post-')):
            heading = post.find('h2', attrs={'class':class_sel('post-title')})
            # Explicit ``is None`` tests: an existing tag must never be
            # mistaken for a missing one through truthiness.
            link = heading.find('a') if heading is not None else None
            url = link.get('href') if link is not None else None
            if not url:
                self.log('\tSkipping post without a title link:', post.get('id'))
                continue
            title = self.tag_to_string(heading)
            self.log('\tArticle:', title)
            desc = ''
            p = post.find('p')
            if p is not None:
                desc = self.tag_to_string(p)
            articles.append({'title':title, 'url':url, 'description':desc})
        return sectitle, articles

    def parse_index(self):
        """Build the feed list from the magazine index page.

        Returns a list of ``(section_title, articles)`` tuples, one for each
        ``magazine-section-holder`` element that yields at least one article.
        As a side effect, ``self.timefmt`` and ``self.cover_url`` are set from
        the ``magazine-issue-wrap`` link when that link is present.
        """
        soup = self.index_to_soup('http://www.spectator.co.uk/magazine/')

        issue = soup.find('a', attrs={'class':'magazine-issue-wrap'})
        if issue is None:
            self.log('Issue link not found; leaving timefmt and cover_url unchanged')
        else:
            if issue.get('title'):
                self.timefmt = issue['title']
            # NOTE(review): the cover is taken from the link target, not from
            # an <img> src - confirm the href points at an image.
            if issue.get('href'):
                self.cover_url = issue['href']

        container = soup.find(id='magazine-full')
        if container is None:
            # Searching the whole page keeps the recipe working if only the
            # wrapper element is renamed or removed.
            self.log('No magazine-full container found; searching the whole page')
            container = soup

        feeds = []
        for holder in container.findAll(attrs={'class':class_sel('magazine-section-holder')}):
            title, articles = self.parse_spec_section(holder)
            if articles:
                feeds.append((title, articles))

        return feeds