from __future__ import print_function

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag matcher for elements carrying any of the given classes.

    ``classes`` is a space separated string of CSS class names. The result
    is a dict with a single ``attrs`` entry whose ``class`` predicate is
    truthy when an element's class attribute shares at least one name with
    the requested set.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class HuffingtonPostRecipe(BasicNewsRecipe):
    """Recipe that builds The Huffington Post from its per-section feeds."""

    __license__ = 'GPL v3'
    __author__ = 'kwetal and Archana Raman'
    language = 'en'
    version = 2

    title = u'The Huffington Post'
    publisher = u'huffingtonpost.com'
    category = u'News, Politics'
    description = u'Political Blog'

    oldest_article = 1.1
    max_articles_per_feed = 100

    encoding = 'utf-8'
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    # Only elements with these classes are kept from each article page.
    keep_only_tags = [
        classes('entry__header entry__body')
    ]
    # Elements with any of these classes are dropped from the kept content.
    remove_tags = [
        classes('app-download-interstitial share-bar top-media--video advertisement extra-content'
                ' below-entry entry-inline-subscription-module related-articles')
    ]

    # Feeds from: http://www.huffingtonpost.com/syndication/
    feeds = [
        (u'Politics',
         u'http://www.huffingtonpost.com/feeds/verticals/politics/index.xml'),
        (u'Media',
         u'http://www.huffingtonpost.com/feeds/verticals/media/index.xml'),
        (u'Business',
         u'http://www.huffingtonpost.com/feeds/verticals/business/index.xml'),
        (u'Entertainment',
         u'http://www.huffingtonpost.com/feeds/verticals/entertainment/index.xml'),
        (u'Living',
         u'http://www.huffingtonpost.com/feeds/verticals/living/index.xml'),
        (u'Style',
         u'http://www.huffingtonpost.com/feeds/verticals/style/index.xml'),
        (u'Green',
         u'http://www.huffingtonpost.com/feeds/verticals/green/index.xml'),
        (u'Technology',
         u'http://www.huffingtonpost.com/feeds/verticals/technology/index.xml'),
        (u'Comedy',
         u'http://www.huffingtonpost.com/feeds/verticals/comedy/index.xml'),
        (u'World',
         u'http://www.huffingtonpost.com/feeds/verticals/world/index.xml'),
        (u'Original Reporting',
         u'http://www.huffingtonpost.com/tag/huffpolitics/feed'),
    ]

    # The stylesheet is assembled from adjacent literals so that each rule
    # sits on its own source line; the resulting string is unchanged.
    extra_css = (
        ' h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}'
        ' h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}'
        ' h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}'
        ' body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}'
        ' #title_permalink{color:black;font-size:large;}'
        ' .date{color:#858585;font-family:"Times New Roman",sans-serif;}'
        ' .comments_datetime v05{color:#696969;}'
        ' .teaser_permalink{font-style:italic;font-size:xx-small;}'
        ' .blog_posted_date{color:#696969;font-size:xx-small;font-weight: bold;} '
    )

    # a[href]{color: blue; text-decoration: none; cursor: pointer;}
    def get_article_url(self, article):
        """
        Workaround for Feedparser behaviour. If an item has more than one
        link element, article.link is empty and article.links contains a
        list of dictionaries. In that case the first non-empty ``href``
        found in that list is used.

        Returns the article URL, or the original (empty) ``link`` value
        when no usable URL is available.
        """
        link = article.get('link')
        # ``link`` can be None in exactly the case this method works around,
        # so it must not be concatenated to a string directly.
        print("Link:" + (link or ''))
        if link:
            return link

        # Scan every entry instead of hard-coding indices 0 and 1, so short
        # lists and entries without an ``href`` cannot raise.
        for entry in article.get('links') or ():
            href = entry.get('href')
            if href:
                return href
        return link

    def postprocess_html(self, soup, first_fetch):
        """Strip the reaction prompt and all blockquotes, then return soup."""
        for tag in soup.findAll('div', text="What's Your Reaction?"):
            tag.extract()

        for tg in soup.findAll('blockquote'):
            tg.extract()

        return soup