calibre/recipes/independent.recipe
Kovid Goyal 80c5dd6ac3 ...
2016-09-16 10:21:48 +05:30

159 lines
6.2 KiB
Plaintext

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class TheIndependentNew(BasicNewsRecipe):
    """calibre news recipe for The Independent, driven by its RSS feeds.

    The class-level settings describe the publication and tell the recipe
    framework which parts of each article page to keep or drop (see the
    calibre recipe API documentation for the exact meaning of each option).
    ``preprocess_html`` additionally rewrites photo galleries so that each
    legend item carries its image.
    """

    # --- Publication metadata -------------------------------------------
    title = u'The Independent'
    __author__ = 'Krittika Goyal'
    # Adjacent literals are joined by the parser, so the value does not
    # depend on how these lines are indented.
    description = (
        'The latest in UK News and World News from The '
        'Independent. Wide range of international and local news, sports '
        'news, commentary and opinion pieces.Independent News - Breaking news '
        'that matters. Your daily comprehensive news source - The '
        'Independent Newspaper'
    )
    publisher = 'The Independent'
    category = 'news, UK'
    language = 'en_GB'
    publication_type = 'newspaper'

    # --- Download / conversion options ----------------------------------
    oldest_article = 2.0
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    compress_news_images = True

    # --- Page clean-up rules --------------------------------------------
    # Tag specifications for the parts of the article page to retain.
    keep_only_tags = [
        dict(itemprop=['articleBody', 'headline', 'contentUrl']),
        dict(attrs={'class': ['intro', 'author']}),
    ]
    # Tag specifications for elements to drop. The lambdas match when the
    # given name is one of the whitespace-separated tokens of the class
    # attribute; the leading ``classes and`` guards against a missing/empty
    # attribute value.
    remove_tags = [
        dict(attrs={'class': lambda classes: classes and 'show-all' in classes.split()}),
        dict(attrs={'class': lambda classes: classes and 'context-sdl_editor_representation' in classes.split()}),
        dict(attrs={'data-scald-gallery': True}),
    ]
    remove_attributes = ['style']

    def preprocess_html(self, soup):
        """Attach gallery images to their legend items.

        For every element whose class is ``full-gallery``:

        1. each ``li`` carrying both ``data-gallery-item`` and
           ``data-original`` is recorded as ``item id -> image URL`` and then
           removed from the tree;
        2. each remaining ``li`` with a ``data-gallery-legend`` attribute
           whose value is a recorded item id gets a new ``img`` child whose
           ``src`` is the recorded URL.

        :param soup: parsed page tree (built with calibre's BeautifulSoup
            wrapper, which also provides ``Tag``); modified in place.
        :return: the same ``soup`` object.
        """
        for gallery in soup.findAll(attrs={'class': 'full-gallery'}):
            # The mapping is rebuilt per gallery so ids never leak between
            # separate galleries on the same page.
            sources = {}
            for item in gallery.findAll(
                    'li', attrs={'data-gallery-item': True, 'data-original': True}):
                sources[item['data-gallery-item']] = item['data-original']
                item.extract()
            for legend in gallery.findAll('li', attrs={'data-gallery-legend': True}):
                src = sources.get(legend['data-gallery-legend'])
                if src is None:
                    # Legend without a matching image item: leave it as is.
                    continue
                img = Tag(soup, 'img')
                img['src'] = src
                img['style'] = 'display:block'
                legend.append(img)
        return soup

    # --- Feeds: (section title, RSS URL) pairs --------------------------
    feeds = [
        # News
        (u'News - UK',
         u'http://www.independent.co.uk/news/uk/rss'),
        (u'News - World',
         u'http://www.independent.co.uk/news/world/rss'),
        (u'News - Business',
         u'http://www.independent.co.uk/news/business/rss'),
        (u'News - People',
         u'http://www.independent.co.uk/news/people/rss'),
        (u'News - Science',
         u'http://www.independent.co.uk/news/science/rss'),
        (u'News - Media',
         u'http://www.independent.co.uk/news/media/rss'),
        (u'News - Education',
         u'http://www.independent.co.uk/news/education/rss'),
        (u'News - Obituaries',
         u'http://www.independent.co.uk/news/obituaries/rss'),
        (u'News - Corrections',
         u'http://www.independent.co.uk/news/corrections/rss'),
        (u'Voices',
         u'http://www.independent.co.uk/voices/rss'),
        (u'Environment',
         u'http://www.independent.co.uk/environment/rss'),
        # Sport
        (u'Sport - Athletics',
         u'http://www.independent.co.uk/sport/general/athletics/rss'),
        (u'Sport - Cricket',
         u'http://www.independent.co.uk/sport/cricket/rss'),
        (u'Sport - Football',
         u'http://www.independent.co.uk/sport/football/rss'),
        (u'Sport - Golf',
         u'http://www.independent.co.uk/sport/golf/rss'),
        (u'Sport - Motor racing',
         u'http://www.independent.co.uk/sport/motor-racing/rss'),
        (u'Sport - Olympics',
         u'http://www.independent.co.uk/sport/olympics/rss'),
        (u'Sport - Racing',
         u'http://www.independent.co.uk/sport/racing/rss'),
        (u'Sport - Rugby League',
         u'http://www.independent.co.uk/sport/general/rugby-league/rss'),
        (u'Sport - Rugby Union',
         u'http://www.independent.co.uk/sport/rugby/rugby-union/rss'),
        (u'Sport - Sailing',
         u'http://www.independent.co.uk/sport/general/sailing/rss'),
        (u'Sport - Tennis',
         u'http://www.independent.co.uk/sport/tennis/rss'),
        (u'Sport - Others',
         u'http://www.independent.co.uk/sport/general/others/rss'),
        # Life & Style
        (u'Life & Style - Fashion',
         u'http://www.independent.co.uk/life-style/fashion/rss'),
        (u'Life & Style -Food & Drink',
         u'http://www.independent.co.uk/life-style/food-and-drink/rss'),
        (u'Life & Style - Health and Families',
         u'http://www.independent.co.uk/life-style/health-and-families/rss'),
        (u'Life & Style - History',
         u'http://www.independent.co.uk/life-style/history/rss'),
        (u'Life & Style - Gadgets & Tech',
         u'http://www.independent.co.uk/life-style/gadgets-and-tech/rss'),
        (u'Life & Style - Motoring',
         u'http://www.independent.co.uk/life-style/motoring/rss'),
        # Arts & Entertainment
        (u'Arts & Ents - Art',
         u'http://www.independent.co.uk/arts-entertainment/art/rss'),
        (u'Arts & Ents - Architecture',
         u'http://www.independent.co.uk/arts-entertainment/architecture/rss'),
        (u'Arts & Ents - Music',
         u'http://www.independent.co.uk/arts-entertainment/music/rss'),
        (u'Arts & Ents - Classical',
         u'http://www.independent.co.uk/arts-entertainment/classical/rss'),
        (u'Arts & Ents - Films',
         u'http://www.independent.co.uk/arts-entertainment/films/rss'),
        (u'Arts & Ents - TV',
         u'http://www.independent.co.uk/arts-entertainment/tv/rss'),
        (u'Arts & Ents - Theatre and Dance',
         u'http://www.independent.co.uk/arts-entertainment/theatre-dance/rss'),
        (u'Arts & Ents - Comedy',
         u'http://www.independent.co.uk/arts-entertainment/comedy/rss'),
        (u'Arts & Ents - Books',
         u'http://www.independent.co.uk/arts-entertainment/books/rss'),
        # Other sections
        (u'Travel',
         u'http://www.independent.co.uk/travel/rss'),
        (u'Money',
         u'http://www.independent.co.uk/money/rss'),
        (u'IndyBest',
         u'http://www.independent.co.uk/extras/indybest/rss'),
    ]