# NOTE: reconstructed from the post-patch ('+' and context) side of a
# whitespace-collapsed diff of recipes/independent.recipe.
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic '
'''
www.independent.co.uk
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


class TheIndependent(BasicNewsRecipe):
    """Recipe that builds an e-book from The Independent's RSS feeds."""

    title = 'The Independent'
    __author__ = 'Darko Miletic'
    description = 'Independent News - Breaking news, comment and features from The Independent newspaper'
    publisher = 'The Independent'
    category = 'news, politics, UK'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    language = 'en_GB'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.independent.co.uk/independent.co.uk/images/logo-london.png'
    extra_css = """
        h1{font-family: Georgia,serif }
        body{font-family: Verdana,Arial,Helvetica,sans-serif}
        img{margin-bottom: 0.4em; display:block}
        .info,.caption,.credits{font-size: x-small}
    """

    # Metadata mirrored from the class attributes defined above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Elements dropped from every article, selected by tag name, class, or id.
    remove_tags = [
        dict(name=['meta', 'link', 'object', 'embed', 'iframe', 'base', 'style']),
        dict(attrs={'class': ['related-articles', 'share', 'googleCols',
                              'article-tools', 'paging', 'googleArt']}),
        dict(attrs={'id': ['newsVideoPlayer', 'yahoobook', 'google-intext']}),
    ]
    # Only the element with id="article" is kept.
    keep_only_tags = [dict(attrs={'id': 'article'})]
    remove_attributes = ['lang', 'onclick', 'width', 'xmlns:fb']

    # (section title, RSS URL) pairs.
    feeds = [
        (u'UK', u'http://www.independent.co.uk/news/uk/rss'),
        (u'World', u'http://www.independent.co.uk/news/world/rss'),
        (u'Business', u'http://www.independent.co.uk/news/business/rss'),
        (u'People', u'http://www.independent.co.uk/news/people/rss'),
        (u'Science', u'http://www.independent.co.uk/news/science/rss'),
        (u'Media', u'http://www.independent.co.uk/news/media/rss'),
        (u'Education', u'http://www.independent.co.uk/news/education/rss'),
        (u'Leading Articles', u'http://www.independent.co.uk/opinion/leading-articles/rss'),
        (u'Commentators', u'http://www.independent.co.uk/opinion/commentators/rss'),
        (u'Columnists', u'http://www.independent.co.uk/opinion/columnists/rss'),
        (u'Letters', u'http://www.independent.co.uk/opinion/letters/rss'),
        (u'Big Question', u'http://www.independent.co.uk/extras/big-question/rss'),
        (u'Sport', u'http://www.independent.co.uk/sport/rss'),
        (u'Life&Style', u'http://www.independent.co.uk/life-style/rss'),
        (u'Arts&Entertainment', u'http://www.independent.co.uk/arts-entertainment/rss'),
        (u'Travel', u'http://www.independent.co.uk/travel/rss'),
        (u'Money', u'http://www.independent.co.uk/money/rss'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's ``guid`` as the article URL, or None if absent."""
        return article.get('guid', None)

    def preprocess_html(self, soup):
        """Normalize the article markup and return it inside a fresh document.

        Strips inline ``style`` attributes, renames ``author``/``preform``
        tags to ``span``, gives every image an ``alt`` attribute, turns
        selected ``div`` classes into paragraphs, removes empty attribute-less
        ``div`` elements, and finally moves the body into a new minimal soup.
        If the soup has no body it is returned unchanged.
        """
        body = soup.body
        if body is None:
            # Nothing to normalize; avoid an AttributeError on None below.
            return soup

        for item in body.findAll(style=True):
            del item['style']
        for item in body.findAll(['author', 'preform']):
            item.name = 'span'
        for item in body.findAll('img'):
            # Tag.get() replaces the deprecated has_key(); an empty alt=""
            # still counts as present, exactly as before.
            if item.get('alt') is None:
                item['alt'] = 'image'
        for item in body.findAll('div', attrs={'class': ['clear-o', 'body', 'photoCaption']}):
            item.name = 'p'
        for item in body.findAll('div'):
            if not item.attrs and not item.contents:
                item.extract()

        # The template needs a real <body> to swap out. A bare 't' string has
        # none unless the parser synthesizes missing elements.
        # NOTE(review): the skeleton markup was lost upstream and is
        # reconstructed here - confirm against the original recipe.
        soup2 = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        soup2.body.replaceWith(body)
        return soup2