__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic '
'''
smh.com.au
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Smh_au(BasicNewsRecipe):
    """Calibre news recipe for The Sydney Morning Herald (smh.com.au).

    The recipe is almost entirely declarative: the class attributes below
    configure the ``BasicNewsRecipe`` machinery (which RSS feeds to read and
    which parts of each article page to keep or discard). The only custom
    code is :meth:`preprocess_html`, which tidies the parsed page.
    """

    # --- Publication metadata -------------------------------------------
    title = 'The Sydney Morning Herald'
    __author__ = 'Darko Miletic'
    description = 'Breaking news from Sydney, Australia and the world. Features the latest business, sport, entertainment, travel, lifestyle, and technology news.'  # noqa
    publisher = 'Fairfax Digital'
    category = 'news, politics, Australia, Sydney'
    language = 'en_AU'
    publication_type = 'newspaper'
    masthead_url = 'http://images.smh.com.au/2010/02/02/1087188/smh-620.jpg'

    # --- Download behaviour ---------------------------------------------
    # Age limit for articles; calibre's BasicNewsRecipe documents this
    # value as being expressed in days.
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}
    # Feeds only carry headlines, so full article pages are downloaded.
    use_embedded_content = False
    encoding = 'utf-8'
    remove_empty_feeds = True

    # Adjacent literals concatenate to one CSS string; the pieces are split
    # per rule purely for readability.
    extra_css = (
        ' h1{font-family: Georgia,"Times New Roman",Times,serif }'
        ' body{font-family: Arial,Helvetica,sans-serif}'
        ' .cT-imageLandscape,.cT-imagePortrait{font-size: x-small} '
    )

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        ('Politics', 'http://www.smh.com.au/rssheadlines/political-news/article/rss.xml'),
        ('NSW News', 'http://www.smh.com.au/rssheadlines/nsw/article/rss.xml'),
        ('World', 'http://www.smh.com.au/rssheadlines/world/article/rss.xml'),
        ('National', 'http://www.smh.com.au/rssheadlines/national/article/rss.xml'),
        ('Business', 'http://www.smh.com.au/rssheadlines/business/article/rss.xml'),
        ('Entertainment', 'http://feeds.smh.com.au/rssheadlines/entertainment.xml'),
        ('Technology', 'http://www.smh.com.au/rssheadlines/technology-news/article/rss.xml'),
        ('Health', 'http://www.smh.com.au/rssheadlines/health/article/rss.xml'),
        ('Sport', 'http://feeds.smh.com.au/rssheadlines/sport.xml'),
        ('Cricket', 'http://www.smh.com.au/rssheadlines/cricket/article/rss.xml'),
    ]

    # --- Page clean-up selectors ----------------------------------------
    # Everything following the article body container is dropped.
    remove_tags_after = [
        dict(name='div', attrs={'class': 'articleBody'}),
    ]
    # Only the main content container of the page is retained.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]
    # Ads, comments, multimedia and embedded/non-content elements.
    # 'object', 'embed' and 'iframe' used to be listed in a second,
    # redundant entry as well; the single name list below covers them all.
    remove_tags = [
        dict(name='div', attrs={
            'id': ['googleAds', 'moreGoogleAds', 'comments',
                   'video-player-content']}),
        dict(name='div', attrs={'class': 'cT-imageMultimedia'}),
        dict(attrs={'class': 'hidden'}),
        dict(name=['link', 'meta', 'base', 'embed', 'object', 'iframe']),
    ]
    remove_attributes = ['width', 'height', 'lang']

    def preprocess_html(self, soup):
        """Clean up a parsed article page.

        Deletes the inline ``style`` attribute from every element that has
        one and renames every ``<bod>`` element to ``<div>``.

        :param soup: parsed article document (must provide ``findAll``).
        :return: the same ``soup`` object, modified in place.
        """
        for styled in soup.findAll(style=True):
            del styled['style']
        # NOTE(review): 'bod' looks like a deliberate match on a non-standard
        # tag in the site's markup rather than a typo for 'body' -- confirm
        # against a live article page before changing it.
        for bod in soup.findAll('bod'):
            bod.name = 'div'
        return soup