__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic '
'''
www.independent.co.uk
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class TheIndependent(BasicNewsRecipe):
    """Calibre news recipe for The Independent (www.independent.co.uk).

    Articles are collected from the section RSS feeds listed in ``feeds``;
    each downloaded page is reduced to its ``id="article"`` element and then
    tidied by ``preprocess_html``.
    """

    # --- Metadata -----------------------------------------------------------
    title = 'The Independent'
    __author__ = 'Darko Miletic'
    description = 'Independent News - Breaking news, comment and features from The Independent newspaper'
    publisher = 'The Independent'
    category = 'news, politics, UK'
    language = 'en_GB'
    publication_type = 'newspaper'
    masthead_url = 'http://www.independent.co.uk/independent.co.uk/images/logo-london.png'

    # --- Fetch settings (see the calibre BasicNewsRecipe API for semantics) --
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    remove_empty_feeds = True

    # Replacement styling for the generated e-book.
    extra_css = """
        h1{font-family: Georgia,serif }
        body{font-family: Verdana,Arial,Helvetica,sans-serif}
        img{margin-bottom: 0.4em; display:block}
        .info,.caption,.credits{font-size: x-small}
    """

    # Reuse the class-level metadata for the conversion step so the values
    # are defined in exactly one place.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- Page clean-up rules ------------------------------------------------
    # Only the element with id="article" is kept ...
    keep_only_tags = [dict(attrs={'id': 'article'})]

    # ... and within it these elements are removed: non-content tags, then
    # sharing/related/advertising containers matched by class or id.
    remove_tags = [
        dict(name=['meta', 'link', 'object', 'embed', 'iframe', 'base', 'style']),
        dict(attrs={'class': ['related-articles', 'share', 'googleCols',
                              'article-tools', 'paging', 'googleArt']}),
        dict(attrs={'id': ['newsVideoPlayer', 'yahoobook', 'google-intext']}),
    ]

    remove_attributes = ['lang', 'onclick', 'width', 'xmlns:fb']

    # --- Feeds: (section title, RSS URL) ------------------------------------
    feeds = [
        (u'UK', u'http://www.independent.co.uk/news/uk/rss'),
        (u'World', u'http://www.independent.co.uk/news/world/rss'),
        (u'Business', u'http://www.independent.co.uk/news/business/rss'),
        (u'People', u'http://www.independent.co.uk/news/people/rss'),
        (u'Science', u'http://www.independent.co.uk/news/science/rss'),
        (u'Media', u'http://www.independent.co.uk/news/media/rss'),
        (u'Education', u'http://www.independent.co.uk/news/education/rss'),
        (u'Leading Articles', u'http://www.independent.co.uk/opinion/leading-articles/rss'),
        (u'Commentators', u'http://www.independent.co.uk/opinion/commentators/rss'),
        (u'Columnists', u'http://www.independent.co.uk/opinion/columnists/rss'),
        (u'Letters', u'http://www.independent.co.uk/opinion/letters/rss'),
        (u'Big Question', u'http://www.independent.co.uk/extras/big-question/rss'),
        (u'Sport', u'http://www.independent.co.uk/sport/rss'),
        (u'Life&Style', u'http://www.independent.co.uk/life-style/rss'),
        (u'Arts&Entertainment', u'http://www.independent.co.uk/arts-entertainment/rss'),
        (u'Travel', u'http://www.independent.co.uk/travel/rss'),
        (u'Money', u'http://www.independent.co.uk/money/rss'),
    ]

    def get_article_url(self, article):
        """Return the article URL taken from the feed entry's ``guid``.

        :param article: feed entry object supporting ``.get(key)``.
        :return: the value stored under ``'guid'``, or ``None`` if the entry
            has no such field.
        """
        return article.get('guid')

    def preprocess_html(self, soup):
        """Tidy a downloaded article page and return it as a fresh document.

        Steps, all applied to ``soup.body``:

        * drop inline ``style`` attributes;
        * rename the non-standard ``author`` / ``preform`` tags to ``span``;
        * give every ``img`` lacking an ``alt`` attribute the text ``image``;
        * turn ``div`` elements of class ``clear-o``, ``body`` or
          ``photoCaption`` into paragraphs;
        * remove ``div`` elements that have neither attributes nor content;
        * move the cleaned body into a new minimal HTML document.

        :param soup: parsed page (BeautifulSoup object) with a ``body``.
        :return: a new BeautifulSoup document whose body is the cleaned
            body of ``soup``.
        """
        body = soup.body

        for tag in body.findAll(style=True):
            del tag['style']

        for tag in body.findAll(['author', 'preform']):
            tag.name = 'span'

        for img in body.findAll('img'):
            # Tag.get() exists in both the BeautifulSoup 3 and 4 APIs,
            # unlike the legacy has_key() method.
            if img.get('alt') is None:
                img['alt'] = 'image'

        for div in body.findAll('div', attrs={'class': ['clear-o', 'body', 'photoCaption']}):
            div.name = 'p'

        # Runs after the rename above, so only divs that are still divs and
        # are completely empty (no attributes, no children) are discarded.
        for div in body.findAll('div'):
            if not div.attrs and not div.contents:
                div.extract()

        # The skeleton must contain a <body> element: it is the node that
        # gets swapped for the cleaned body. A markup-free string would
        # leave ``skeleton.body`` as None on parsers that do not synthesize
        # one, and the replaceWith call would then fail.
        skeleton = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        skeleton.body.replaceWith(body)
        return skeleton