mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-31 02:27:01 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			202 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			202 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | |
| # vim:fileencoding=utf-8
 | |
| from calibre import random_user_agent
 | |
| from calibre.ebooks.BeautifulSoup import Tag
 | |
| from calibre.web.feeds.recipes import BasicNewsRecipe
 | |
| 
 | |
| 
 | |
def classes(classes):
    """Return an ``attrs`` filter dict matching any of the given class names.

    ``classes`` is a string of class names separated by single spaces.
    The result is ``{'attrs': {'class': predicate}}`` where ``predicate``
    receives a class-attribute value and yields a truthy result when that
    value shares at least one whitespace-separated name with ``classes``.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Falsy values (None, '') are handed back unchanged, so a missing
        # or empty class attribute never matches.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}
 | |
| 
 | |
| 
 | |
def new_tag(soup, name, attrs=()):
    """Create a new tag called ``name`` associated with ``soup``.

    If ``soup`` exposes a non-None ``new_tag`` attribute, it is called with
    ``attrs`` converted to a dict. Otherwise the tag is built by calling
    ``Tag`` directly, passing ``None`` when ``attrs`` is empty.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on the soup object: construct the Tag ourselves.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
 | |
| 
 | |
| 
 | |
class TheIndependentNew(BasicNewsRecipe):
    """Recipe for The Independent, built from the site's per-section RSS feeds."""

    # --- Publication metadata -------------------------------------------
    title = u'The Independent'
    __author__ = 'Krittika Goyal'
    description = (
        'The latest in UK News and World News from The '
        'Independent. Wide range of international and local news, sports '
        'news, commentary and opinion pieces.Independent News - Breaking news '
        'that matters. Your daily comprehensive news source - The '
        'Independent Newspaper'
    )
    publisher = 'The Independent'
    category = 'news, UK'
    language = 'en_GB'
    publication_type = 'newspaper'
    encoding = 'utf-8'

    # --- Download behaviour ---------------------------------------------
    oldest_article = 2.0
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    compress_news_images = True

    # The 'days' option defaults to the class-level oldest_article above.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the base recipe, then apply a string 'days' option if present."""
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        days = self.recipe_specific_options.get('days')
        # The class-level entry for 'days' is a dict, so this only fires when
        # the option has been replaced by a non-empty string value
        # (presumably supplied by the user at run time -- confirm against
        # BasicNewsRecipe).
        if not days or not isinstance(days, str):
            return
        self.oldest_article = float(days)

    # --- Article clean-up rules -----------------------------------------
    keep_only_tags = [
        dict(id=['articleHeader', 'main']),
        classes('headline sub-headline breadcrumb author publish-date hero-image body-content'),
    ]
    remove_tags = [
        classes('inline-related inline-readmore ad-wrapper icon-gallery i-gallery article-im-prompt apester-media'
                ' social-share i-amphtml-replaced-content gallery-btn related'),
        dict(name='amp-live-list'),
    ]
    remove_attributes = ['style']

    def get_browser(self, *a, **kw):
        """Return a browser from the base class using a cached non-IE user agent."""
        # The site serves images as JPEG-XR when the user agent is IE, so a
        # non-IE agent is chosen once and reused for every later call.
        if not hasattr(self, 'non_ie_ua'):
            try:
                agent = random_user_agent(allow_ie=False)
            except TypeError:
                # random_user_agent() rejected the allow_ie keyword; fall
                # back to a fixed Chrome user-agent string.
                agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.111 Safari/537.36'
            self.non_ie_ua = agent
        kw['user_agent'] = self.non_ie_ua
        return BasicNewsRecipe.get_browser(self, *a, **kw)

    def preprocess_html(self, soup):
        """Convert amp-img tags to img and inline images into gallery legends."""
        # Rename AMP image elements to plain <img> and blank their srcset.
        for amp_img in soup.findAll('amp-img'):
            amp_img.name = 'img'
            amp_img['srcset'] = ''

        for gallery in soup.findAll(attrs={'class': 'full-gallery'}):
            # Map gallery item id -> original image URL, dropping the item
            # <li> elements from the tree as they are recorded.
            sources = {}
            item_filter = {'data-gallery-item': True, 'data-original': True}
            for item in gallery.findAll('li', attrs=item_filter):
                sources[item['data-gallery-item']] = item['data-original']
                item.extract()

            # Attach the matching image to each legend entry that has one.
            for legend in gallery.findAll('li', attrs={'data-gallery-legend': True}):
                src = sources.get(legend['data-gallery-legend'])
                if src is None:
                    continue
                img = new_tag(soup, 'img')
                img['src'] = src
                img['style'] = 'display:block'
                legend.append(img)
        return soup

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        (u'News - UK', u'http://www.independent.co.uk/news/uk/rss'),
        (u'News - World', u'http://www.independent.co.uk/news/world/rss'),
        (u'News - Business', u'http://www.independent.co.uk/news/business/rss'),
        (u'News - People', u'http://www.independent.co.uk/news/people/rss'),
        (u'News - Science', u'http://www.independent.co.uk/news/science/rss'),
        (u'News - Media', u'http://www.independent.co.uk/news/media/rss'),
        (u'News - Education', u'http://www.independent.co.uk/news/education/rss'),
        (u'News - Obituaries', u'http://www.independent.co.uk/news/obituaries/rss'),
        (u'News - Corrections', u'http://www.independent.co.uk/news/corrections/rss'),
        (u'Voices', u'http://www.independent.co.uk/voices/rss'),
        (u'Environment', u'http://www.independent.co.uk/environment/rss'),
        (u'Sport - Athletics', u'http://www.independent.co.uk/sport/general/athletics/rss'),
        (u'Sport - Cricket', u'http://www.independent.co.uk/sport/cricket/rss'),
        (u'Sport - Football', u'http://www.independent.co.uk/sport/football/rss'),
        (u'Sport - Golf', u'http://www.independent.co.uk/sport/golf/rss'),
        (u'Sport - Motor racing', u'http://www.independent.co.uk/sport/motor-racing/rss'),
        (u'Sport - Olympics', u'http://www.independent.co.uk/sport/olympics/rss'),
        (u'Sport - Racing', u'http://www.independent.co.uk/sport/racing/rss'),
        (u'Sport - Rugby League', u'http://www.independent.co.uk/sport/general/rugby-league/rss'),
        (u'Sport - Rugby Union', u'http://www.independent.co.uk/sport/rugby/rugby-union/rss'),
        (u'Sport - Sailing', u'http://www.independent.co.uk/sport/general/sailing/rss'),
        (u'Sport - Tennis', u'http://www.independent.co.uk/sport/tennis/rss'),
        (u'Sport - Others', u'http://www.independent.co.uk/sport/general/others/rss'),
        (u'Life & Style - Fashion', u'http://www.independent.co.uk/life-style/fashion/rss'),
        (u'Life & Style -Food & Drink', u'http://www.independent.co.uk/life-style/food-and-drink/rss'),
        (u'Life & Style - Health and Families', u'http://www.independent.co.uk/life-style/health-and-families/rss'),
        (u'Life & Style - History', u'http://www.independent.co.uk/life-style/history/rss'),
        (u'Life & Style - Gadgets & Tech', u'http://www.independent.co.uk/life-style/gadgets-and-tech/rss'),
        (u'Life & Style - Motoring', u'http://www.independent.co.uk/life-style/motoring/rss'),
        (u'Arts & Ents - Art', u'http://www.independent.co.uk/arts-entertainment/art/rss'),
        (u'Arts & Ents - Architecture', u'http://www.independent.co.uk/arts-entertainment/architecture/rss'),
        (u'Arts & Ents - Music', u'http://www.independent.co.uk/arts-entertainment/music/rss'),
        (u'Arts & Ents - Classical', u'http://www.independent.co.uk/arts-entertainment/classical/rss'),
        (u'Arts & Ents - Films', u'http://www.independent.co.uk/arts-entertainment/films/rss'),
        (u'Arts & Ents - TV', u'http://www.independent.co.uk/arts-entertainment/tv/rss'),
        (u'Arts & Ents - Theatre and Dance', u'http://www.independent.co.uk/arts-entertainment/theatre-dance/rss'),
        (u'Arts & Ents - Comedy', u'http://www.independent.co.uk/arts-entertainment/comedy/rss'),
        (u'Arts & Ents - Books', u'http://www.independent.co.uk/arts-entertainment/books/rss'),
        (u'Travel', u'http://www.independent.co.uk/travel/rss'),
        (u'Money', u'http://www.independent.co.uk/money/rss'),
        (u'IndyBest', u'http://www.independent.co.uk/extras/indybest/rss'),
    ]
 |