#!/usr/bin/env python2
# vim:fileencoding=utf-8
"""Calibre news recipe that builds an e-book from The Independent's RSS feeds."""

from calibre import random_user_agent
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def classes(classes):
    """Build a tag matcher that selects elements by CSS class.

    :param classes: Space-separated class names; an element matches when its
        ``class`` attribute shares at least one name with this set.
    :return: ``{'attrs': {'class': predicate}}``. The predicate returns a
        falsy value for a missing/empty ``class`` attribute, otherwise the
        (possibly empty) intersection of the element's classes with the
        requested ones.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
    """Create a new tag named ``name`` belonging to ``soup``.

    Uses ``soup.new_tag`` when the soup object provides it and otherwise
    falls back to constructing ``Tag`` directly, so the helper works with
    either flavour of the BeautifulSoup API.

    :param soup: The document the tag is created for.
    :param name: Tag name, e.g. ``'img'``.
    :param attrs: Attributes as an iterable of ``(key, value)`` pairs.
    :return: The newly created tag (not yet inserted into the tree).
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class TheIndependentNew(BasicNewsRecipe):
    """Recipe settings and HTML clean-up for The Independent."""

    title = u'The Independent'
    __author__ = 'Krittika Goyal'
    # Adjacent string literals are concatenated at compile time, which keeps
    # the text free of stray backslashes and indentation whitespace that a
    # backslash continuation inside the quotes would introduce.
    description = (
        'The latest in UK News and World News from The '
        'Independent. Wide range of international and local news, sports '
        'news, commentary and opinion pieces.Independent News - Breaking news '
        'that matters. Your daily comprehensive news source - The '
        'Independent Newspaper'
    )
    publisher = 'The Independent'
    oldest_article = 2.0
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    category = 'news, UK'
    no_stylesheets = True
    use_embedded_content = False
    language = 'en_GB'
    publication_type = 'newspaper'
    encoding = 'utf-8'
    compress_news_images = True

    # Matchers (built with classes()) for the parts of an article page that
    # are kept and for the parts that are stripped out again.
    keep_only_tags = [
        classes('headline sub-headline breadcrumb author publish-date hero-image body-content'),
    ]
    remove_tags = [
        classes('inline-related inline-readmore ad-wrapper icon-gallery i-gallery')
    ]
    remove_attributes = ['style']

    def get_browser(self, *a, **kw):
        """Return a browser that always identifies itself with a non-IE user agent.

        The user agent is chosen once per recipe instance and cached on
        ``self.non_ie_ua``, so every browser created by this recipe uses the
        same one. Any ``user_agent`` passed by the caller is overridden.
        """
        # This site returns images in JPEG-XR format if the user agent is IE
        if not hasattr(self, 'non_ie_ua'):
            try:
                self.non_ie_ua = random_user_agent(allow_ie=False)
            except TypeError:
                # random_user_agent() rejected the allow_ie keyword; use a
                # fixed non-IE user agent instead.
                self.non_ie_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
        kw['user_agent'] = self.non_ie_ua
        br = BasicNewsRecipe.get_browser(self, *a, **kw)
        return br

    def preprocess_html(self, soup):
        """Normalise images and galleries in a downloaded article.

        * Every ``<amp-img>`` is renamed to ``<img>`` and its ``srcset`` is
          set to the empty string.
        * Inside each ``full-gallery`` element, the ``<li>`` items carrying
          the image URL (``data-original``) are removed, and an ``<img>``
          with that URL is appended to the legend ``<li>`` whose
          ``data-gallery-legend`` equals the item's ``data-gallery-item``.

        :param soup: Parsed article HTML; modified in place.
        :return: The same ``soup`` object.
        """
        for img in soup.findAll('amp-img'):
            img.name = 'img'
            img['srcset'] = ''
        for div in soup.findAll(attrs={'class': 'full-gallery'}):
            # Map gallery item id -> original image URL.
            imgs = {}
            for li in div.findAll('li', attrs={'data-gallery-item': True, 'data-original': True}):
                imgs[li['data-gallery-item']] = li['data-original']
                li.extract()
            for li in div.findAll('li', attrs={'data-gallery-legend': True}):
                src = imgs.get(li['data-gallery-legend'])
                if src is not None:
                    img = new_tag(soup, 'img')
                    img['src'] = src
                    img['style'] = 'display:block'
                    li.append(img)
        return soup

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'News - UK', u'http://www.independent.co.uk/news/uk/rss'),
        (u'News - World', u'http://www.independent.co.uk/news/world/rss'),
        (u'News - Business', u'http://www.independent.co.uk/news/business/rss'),
        (u'News - People', u'http://www.independent.co.uk/news/people/rss'),
        (u'News - Science', u'http://www.independent.co.uk/news/science/rss'),
        (u'News - Media', u'http://www.independent.co.uk/news/media/rss'),
        (u'News - Education', u'http://www.independent.co.uk/news/education/rss'),
        (u'News - Obituaries', u'http://www.independent.co.uk/news/obituaries/rss'),
        (u'News - Corrections', u'http://www.independent.co.uk/news/corrections/rss'),
        (u'Voices', u'http://www.independent.co.uk/voices/rss'),
        (u'Environment', u'http://www.independent.co.uk/environment/rss'),
        (u'Sport - Athletics', u'http://www.independent.co.uk/sport/general/athletics/rss'),
        (u'Sport - Cricket', u'http://www.independent.co.uk/sport/cricket/rss'),
        (u'Sport - Football', u'http://www.independent.co.uk/sport/football/rss'),
        (u'Sport - Golf', u'http://www.independent.co.uk/sport/golf/rss'),
        (u'Sport - Motor racing', u'http://www.independent.co.uk/sport/motor-racing/rss'),
        (u'Sport - Olympics', u'http://www.independent.co.uk/sport/olympics/rss'),
        (u'Sport - Racing', u'http://www.independent.co.uk/sport/racing/rss'),
        (u'Sport - Rugby League', u'http://www.independent.co.uk/sport/general/rugby-league/rss'),
        (u'Sport - Rugby Union', u'http://www.independent.co.uk/sport/rugby/rugby-union/rss'),
        (u'Sport - Sailing', u'http://www.independent.co.uk/sport/general/sailing/rss'),
        (u'Sport - Tennis', u'http://www.independent.co.uk/sport/tennis/rss'),
        (u'Sport - Others', u'http://www.independent.co.uk/sport/general/others/rss'),
        (u'Life & Style - Fashion', u'http://www.independent.co.uk/life-style/fashion/rss'),
        (u'Life & Style -Food & Drink', u'http://www.independent.co.uk/life-style/food-and-drink/rss'),
        (u'Life & Style - Health and Families', u'http://www.independent.co.uk/life-style/health-and-families/rss'),
        (u'Life & Style - History', u'http://www.independent.co.uk/life-style/history/rss'),
        (u'Life & Style - Gadgets & Tech', u'http://www.independent.co.uk/life-style/gadgets-and-tech/rss'),
        (u'Life & Style - Motoring', u'http://www.independent.co.uk/life-style/motoring/rss'),
        (u'Arts & Ents - Art', u'http://www.independent.co.uk/arts-entertainment/art/rss'),
        (u'Arts & Ents - Architecture', u'http://www.independent.co.uk/arts-entertainment/architecture/rss'),
        (u'Arts & Ents - Music', u'http://www.independent.co.uk/arts-entertainment/music/rss'),
        (u'Arts & Ents - Classical', u'http://www.independent.co.uk/arts-entertainment/classical/rss'),
        (u'Arts & Ents - Films', u'http://www.independent.co.uk/arts-entertainment/films/rss'),
        (u'Arts & Ents - TV', u'http://www.independent.co.uk/arts-entertainment/tv/rss'),
        (u'Arts & Ents - Theatre and Dance', u'http://www.independent.co.uk/arts-entertainment/theatre-dance/rss'),
        (u'Arts & Ents - Comedy', u'http://www.independent.co.uk/arts-entertainment/comedy/rss'),
        (u'Arts & Ents - Books', u'http://www.independent.co.uk/arts-entertainment/books/rss'),
        (u'Travel', u'http://www.independent.co.uk/travel/rss'),
        (u'Money', u'http://www.independent.co.uk/money/rss'),
        (u'IndyBest', u'http://www.independent.co.uk/extras/indybest/rss'),
    ]