__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
smh.com.au
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Smh_au(BasicNewsRecipe):
    """Recipe for the printed edition of The Sydney Morning Herald.

    Articles are collected by ``parse_index`` from the "today's paper"
    page rather than from RSS feeds.
    """

    # --- Publication metadata -------------------------------------------
    title                 = 'The Sydney Morning Herald - Printed edition'
    __author__            = 'Darko Miletic'
    description           = 'Breaking news from Sydney, Australia and the world. Features the latest business, sport, entertainment, travel, lifestyle, and technology news.'
    publisher             = 'Fairfax Digital'
    category              = 'news, politics, Australia, Sydney'
    language              = 'en_AU'
    publication_type      = 'newspaper'
    masthead_url          = 'http://images.smh.com.au/2010/02/02/1087188/smh-620.jpg'

    # --- Download / conversion settings (see the BasicNewsRecipe API) ----
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    remove_empty_feeds    = True
    extra_css             = ' h1{font-family: Georgia,"Times New Roman",Times,serif } body{font-family: Arial,Helvetica,sans-serif} .cT-imageLandscape{font-size: x-small} '

    # Metadata reused from the attributes above.
    conversion_options = {
        'comment'  : description,
        'tags'     : category,
        'publisher': publisher,
        'language' : language,
    }

    # --- Article clean-up rules ------------------------------------------
    remove_tags = [
        dict(name='div', attrs={'id': ['googleAds', 'moreGoogleAds', 'comments']}),
        dict(name='div', attrs={'class': 'cT-imageMultimedia'}),
        dict(name=['object', 'embed', 'iframe']),
    ]
    remove_tags_after = [dict(name='div', attrs={'class': 'articleBody'})]
    keep_only_tags    = [dict(name='div', attrs={'id': 'content'})]
    remove_attributes = ['width', 'height']

    def parse_index(self):
        """Build the article list from the "today's paper" index page.

        As a side effect, ``self.cover_url`` is set to the ``src`` of an
        image whose ``src`` ends in ``frontpage.jpg``, if one is found.

        Returns a one-element list ``[(feed_title, articles)]``. Each
        article is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. ``feed_title`` is the page's ``<title>`` text,
        or the recipe's own ``title`` when the page has no usable one.
        """
        soup = self.index_to_soup('http://www.smh.com.au/todays-paper')

        # No break here: if several images match, the last one wins.
        for img in soup.findAll('img', src=True):
            if img['src'].endswith('frontpage.jpg'):
                self.cover_url = img['src']

        # Every article gets the same stamp, so format it only once.
        date = strftime(self.timefmt)

        articles = []
        for item in soup.findAll(attrs={'class': 'cN-storyHeadlineLead cfix'}):
            feed_link = item.find('a', href=True)
            if feed_link is None:
                # A headline block without a link cannot be downloaded.
                continue
            summary = item.find('p')
            description = ''
            if summary is not None:
                description = self.tag_to_string(summary)
            articles.append({
                'title'      : self.tag_to_string(feed_link),
                'date'       : date,
                'url'        : feed_link['href'],
                'description': description,
            })

        # The page may lack <head>/<title>, or <title> may contain nested
        # markup (in which case .string is None). Fall back to the recipe
        # title instead of crashing or returning None as the feed name.
        head = soup.find('head')
        title_tag = head.find('title') if head is not None else None
        feed_title = title_tag.string if title_tag is not None else None
        if not feed_title:
            feed_title = self.title

        return [(feed_title, articles)]