diff --git a/recipes/esquire.recipe b/recipes/esquire.recipe index f559f5a6cf..026a0d39d1 100644 --- a/recipes/esquire.recipe +++ b/recipes/esquire.recipe @@ -1,47 +1,80 @@ __license__ = 'GPL v3' -__copyright__ = '2009-2010, Darko Miletic ' ''' www.esquire.com ''' +from collections import defaultdict from calibre.web.feeds.news import BasicNewsRecipe +from css_selectors import Select class Esquire(BasicNewsRecipe): title = 'Esquire' - __author__ = 'Darko Miletic' + __author__ = 'Kovid Goyal' description = 'Esquire: Man at His Best' publisher = 'Hearst Communications, Inc.' - category = 'magazine, men, women we love, style, the guide, sex, screen' - oldest_article = 30 - max_articles_per_feed = 100 no_stylesheets = True - encoding = 'cp1250' - use_embedded_content = False + encoding = 'utf-8' language = 'en' - publication_type = 'magazine' - masthead_url = 'http://www.esquire.com/cm/shared/site_images/print_this/esquire_logo.gif' - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } + keep_only_tags = [ + dict(name='header', attrs={'class':['gallery-header', 'article-header']}), + dict(attrs={'class':['gallery-main-view', 'article-body--content']}), + ] - keep_only_tags = [dict(name='div', attrs={'id':['article_header','article_content']})] - remove_tags = [dict(name=['object','link','embed','iframe','base'])] - remove_attributes = ['width','height'] - - feeds = [ - (u'Style' , u'http://www.esquire.com/style/rss/' ) - ,(u'Women' , u'http://www.esquire.com/women/rss/' ) - ,(u'Features' , u'http://www.esquire.com/features/rss/' ) - ,(u'Fiction' , u'http://www.esquire.com/fiction/rss/' ) - ,(u'Frontpage', u'http://www.esquire.com/rss/' ) - ] + remove_tags = [ + dict(attrs={'class':'article-body--share-container'}), + dict(attrs={'class':lambda x: x and 'tags--top' in x}), + dict(attrs={'class':lambda x: x and 'image-share' in x}), + dict(attrs={'class':lambda x: x and 'share-gallery' in x}), + dict(attrs={'class':lambda x: x and 'embedded-image--expand' in x}), + dict(attrs={'class':lambda x: x and 'embedded-image--close' in x}), + ] def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] + for img in soup.findAll('img', attrs={'data-src':True}): + img['src'] = img['data-src'] return soup + + def parse_index(self): + url = 'http://www.esquire.com' + root = self.index_to_soup(url, as_tree=True) + select = Select(root) + feeds = defaultdict(list) + + for a in select('.custom-promo--title a[href]'): + title = self.tag_to_string(a).strip() + url = a.get('href') + if url.startswith('/'): + url = 'http://www.esquire.com' + url + feeds['Cover Story'] = [{'title':title, 'url':url}] + break + + for story in select('.landing-feed--story-container'): + for sec in select('.landing-feed--story-section-name', story): + section = self.tag_to_string(sec).strip() + break + else: + continue + articles = feeds[section] + for a in select('a.landing-feed--story-title[href]', story): + title = self.tag_to_string(a).strip() + url = a.get('href') + if url.startswith('/'): + url = 'http://www.esquire.com' + url + break + else: + continue + for div in select('.landing-feed--story-abstract', story): + desc = self.tag_to_string(div).strip() + break + else: + desc = '' + articles.append({'title':title, 'url':url, 'description':desc}) + + ans = [] + for sec in sorted(feeds, key=lambda x:(x != 'Cover Story', x)): + articles = feeds[sec] + if articles: + ans.append((sec, articles)) + return ans