diff --git a/recipes/independent.recipe b/recipes/independent.recipe
index 8bf87c2d9d..707ab3edd8 100644
--- a/recipes/independent.recipe
+++ b/recipes/independent.recipe
@@ -1,86 +1,119 @@
 
-__license__ = 'GPL v3'
-__copyright__ = '2011, Darko Miletic '
-'''
-www.independent.co.uk
-'''
-
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
-class TheIndependent(BasicNewsRecipe):
-    title = 'The Independent'
-    __author__ = 'Darko Miletic'
-    description = 'Independent News - Breaking news, comment and features from The Independent newspaper'
-    publisher = 'The Independent'
-    category = 'news, politics, UK'
-    oldest_article = 2
-    max_articles_per_feed = 200
-    no_stylesheets = True
-    encoding = 'cp1252'
-    use_embedded_content = False
-    language = 'en_GB'
-    remove_empty_feeds = True
-    publication_type = 'newspaper'
-    masthead_url = 'http://www.independent.co.uk/independent.co.uk/images/logo-london.png'
-    extra_css = """
-        h1{font-family: Georgia,serif }
-        body{font-family: Verdana,Arial,Helvetica,sans-serif}
-        img{margin-bottom: 0.4em; display:block}
-        .info,.caption,.credits{font-size: x-small}
-        """
-
-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-    }
-
-    remove_tags =[
-        dict(name=['meta','link','object','embed','iframe','base','style'])
-        ,dict(attrs={'class':['related-articles','share','googleCols','article-tools','paging','googleArt']})
-        ,dict(attrs={'id':['newsVideoPlayer','yahoobook','google-intext']})
-    ]
-    keep_only_tags =[dict(attrs={'id':'article'})]
-    remove_attributes=['lang','onclick','width','xmlns:fb']
+class AdvancedUserRecipe1320474488(BasicNewsRecipe):
+    title = u'The Independent'
+    oldest_article = 1
+    max_articles_per_feed = 100
+    auto_cleanup = True
+    language = 'en_GB'
+    __author__ = 'NotTaken'
 
     feeds = [
-        (u'UK' , u'http://www.independent.co.uk/news/uk/rss' )
-        ,(u'World' , u'http://www.independent.co.uk/news/world/rss' )
-        ,(u'Business' , u'http://www.independent.co.uk/news/business/rss' )
-        ,(u'People' , u'http://www.independent.co.uk/news/people/rss' )
-        ,(u'Science' , u'http://www.independent.co.uk/news/science/rss' )
-        ,(u'Media' , u'http://www.independent.co.uk/news/media/rss' )
-        ,(u'Education' , u'http://www.independent.co.uk/news/education/rss' )
-        ,(u'Leading Articles' , u'http://www.independent.co.uk/opinion/leading-articles/rss')
-        ,(u'Comentators' , u'http://www.independent.co.uk/opinion/commentators/rss' )
-        ,(u'Columnists' , u'http://www.independent.co.uk/opinion/columnists/rss' )
-        ,(u'Letters' , u'http://www.independent.co.uk/opinion/letters/rss' )
-        ,(u'Big Question' , u'http://www.independent.co.uk/extras/big-question/rss' )
-        ,(u'Sport' , u'http://www.independent.co.uk/sport/rss' )
-        ,(u'Life&Style' , u'http://www.independent.co.uk/life-style/rss' )
-        ,(u'Arts&Entertainment' , u'http://www.independent.co.uk/arts-entertainment/rss' )
-        ,(u'Travel' , u'http://www.independent.co.uk/travel/rss' )
-        ,(u'Money' , u'http://www.independent.co.uk/money/rss' )
-    ]
+        (u'News - UK',
+         u'http://www.independent.co.uk/news/uk/?service=rss'),
+        (u'News - World',
+         u'http://www.independent.co.uk/news/world/?service=rss'),
+        (u'News - Business',
+         u'http://www.independent.co.uk/news/business/?service=rss'),
+        (u'News - People',
+         u'http://www.independent.co.uk/news/people/?service=rss'),
+        (u'News - Science',
+         u'http://www.independent.co.uk/news/science/?service=rss'),
+        (u'News - Media',
+         u'http://www.independent.co.uk/news/media/?service=rss'),
+        (u'News - Education',
+         u'http://www.independent.co.uk/news/education/?service=rss'),
+        (u'News - Obituaries',
+         u'http://rss.feedsportal.com/c/266/f/3531/index.rss'),
+        (u'News - Corrections',
+         u'http://www.independent.co.uk/news/corrections/?service=rss'
+        ),
+        (u'Opinion',
+         u'http://www.independent.co.uk/opinion/?service=rss'),
+        (u'Environment',
+         u'http://www.independent.co.uk/environment/?service=rss'),
+        (u'Sport - Athletics',
+         u'http://www.independent.co.uk/sport/general/athletics/?service=rss'
+        ),
+        (u'Sport - Cricket',
+         u'http://www.independent.co.uk/sport/cricket/?service=rss'),
+        (u'Sport - Football',
+         u'http://www.independent.co.uk/sport/football/?service=rss'),
+        (u'Sport - Golf',
+         u'http://www.independent.co.uk/sport/golf/?service=rss'),
+        (u'Sport - Motor racing',
+         u'http://www.independent.co.uk/sport/motor-racing/?service=rss'
+        ),
+        (u'Sport - Olympics',
+         u'http://rss.feedsportal.com/c/266/f/3800/index.rss'),
+        (u'Sport - Racing',
+         u'http://www.independent.co.uk/sport/racing/?service=rss'),
+        (u'Sport - Rugby League',
+         u'http://rss.feedsportal.com/c/266/f/3795/index.rss'),
+        (u'Sport - Rugby Union',
+         u'http://www.independent.co.uk/sport/rugby/rugby-union/?service=rss'
+        ),
+        (u'Sport - Sailing',
+         u'http://www.independent.co.uk/sport/general/sailing/?service=rss'
+        ),
+        (u'Sport - Tennis',
+         u'http://www.independent.co.uk/sport/tennis/?service=rss'),
+        (u'Sport - Others',
+         u'http://www.independent.co.uk/sport/general/others/?service=rss'
+        ),
+        (u'Life & Style - Fashion',
+         u'http://www.independent.co.uk/life-style/fashion/?service=rss'
+        ),
+        (u'Life & Style -Food & Drink',
+         u'http://www.independent.co.uk/life-style/food-and-drink/?service=rss'
+        ),
+        (u'Life & Style - Health and Families',
+         u'http://www.independent.co.uk/life-style/health-and-families/?service=rss'
+        ),
+        (u'Life & Style - House & Home',
+         u'http://www.independent.co.uk/life-style/house-and-home/?service=rss'),
+        (u'Life & Style - History',
+         u'http://www.independent.co.uk/life-style/history/?service=rss'
+        ),
+        (u'Life & Style - Gadgets & Tech',
+         u'http://www.independent.co.uk/life-style/gadgets-and-tech/?service=rss'
+        ),
+        (u'Life & Style - Motoring',
+         u'http://www.independent.co.uk/life-style/motoring/?service=rss'
+        ),
+        (u'Arts & Ents - Art',
+         u'http://www.independent.co.uk/arts-entertainment/art/?service=rss'
+        ),
+        (u'Arts & Ents - Architecture',
+         u'http://www.independent.co.uk/arts-entertainment/architecture/?service=rss'
+        ),
+        (u'Arts & Ents - Music',
+         u'http://www.independent.co.uk/arts-entertainment/music/?service=rss'
+        ),
+        (u'Arts & Ents - Classical',
+         u'http://www.independent.co.uk/arts-entertainment/classical/?service=rss'
+        ),
+        (u'Arts & Ents - Films',
+         u'http://www.independent.co.uk/arts-entertainment/films/?service=rss'
+        ),
+        (u'Arts & Ents - TV',
+         u'http://www.independent.co.uk/arts-entertainment/tv/?service=rss'
+        ),
+        (u'Arts & Ents - Theatre and Dance',
+         u'http://www.independent.co.uk/arts-entertainment/theatre-dance/?service=rss'
+        ),
+        (u'Arts & Ents - Comedy',
+         u'http://www.independent.co.uk/arts-entertainment/comedy/?service=rss'
+        ),
+        (u'Arts & Ents - Books',
+         u'http://www.independent.co.uk/arts-entertainment/books/?service=rss'
+        ),
+        (u'Travel', u'http://www.independent.co.uk/travel/?service=rss'
+        ),
+        (u'Money', u'http://www.independent.co.uk/money/?service=rss'),
+        (u'IndyBest',
+         u'http://www.independent.co.uk/extras/indybest/?service=rss'),
+        (u'Blogs', u'http://blogs.independent.co.uk/feed/rss/'),
+    ]
 
-    def get_article_url(self, article):
-        return article.get('guid', None)
-
-    def preprocess_html(self, soup):
-        for item in soup.body.findAll(style=True):
-            del item['style']
-        for item in soup.body.findAll(['author','preform']):
-            item.name='span'
-        for item in soup.body.findAll('img'):
-            if not item.has_key('alt'):
-                item['alt'] = 'image'
-        for item in soup.body.findAll('div', attrs={'class':['clear-o','body','photoCaption']}):
-            item.name = 'p'
-        for item in soup.body.findAll('div'):
-            if not item.attrs and not item.contents:
-                item.extract()
-        soup2 = BeautifulSoup('t')
-        soup2.body.replaceWith(soup.body)
-        return soup2