Updated Sydney Morning Herald

2025-07-09 03:04:10 -04:00 · 2010-04-04 22:03:27 +05:30 · 2010-04-04 22:03:27 +05:30 · bd1ee60717
commit bd1ee60717
parent 0953c48b08
2 changed files with 56 additions and 81 deletions
--- a/resources/images/news/smh.png
+++ b/resources/images/news/smh.png
--- a/resources/recipes/smh.recipe
+++ b/resources/recipes/smh.recipe
@@ -1,91 +1,66 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
 __docformat__ = 'restructuredtext en'
 '''
 smh.com.au
 '''
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 class Smh_au(BasicNewsRecipe):
    title                 = 'The Sydney Morning Herald - Printed edition'
    __author__            = 'Darko Miletic'
    description           = 'Breaking news from Sydney, Australia and the world. Features the latest business, sport, entertainment, travel, lifestyle, and technology news.'
    publisher             = 'Fairfax Digital'
    category              = 'news, politics, Australia, Sydney'
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    language              = 'en_AU'
    remove_empty_feeds    = True
    masthead_url          = 'http://images.smh.com.au/2010/02/02/1087188/smh-620.jpg'
    publication_type      = 'newspaper'
    extra_css             = ' h1{font-family: Georgia,"Times New Roman",Times,serif } body{font-family: Arial,Helvetica,sans-serif} .cT-imageLandscape{font-size: x-small} '
-class SMH(BasicNewsRecipe):
+    conversion_options = {
-
+                          'comment'   : description
-    title = 'Sydney Morning Herald'
+                        , 'tags'      : category
-    description = 'Business News, World News and Breaking News in Australia'
+                        , 'publisher' : publisher
-    __author__ = 'Kovid Goyal and Sujata Raman'
+                        , 'language'  : language
-    language = 'en_AU'
+                        }
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    no_javascript = True
    timefmt               = ' [%A, %d %B, %Y]'
    encoding = 'utf-8'
    keep_only_tags = [dict(name='div', attrs ={'id':'content'})]
    remove_tags     = [
                        dict(name='div', attrs={'align' :'right'}),
                        dict(name='p', attrs={'class' :'comments'}),
                        dict(name='a', attrs={'class' :['more-photos','performerpromo']}),
                        dict(name='img', attrs={'alt' :'aap'}),
                        dict(name='div', attrs ={'id':['googleAds','moreGoogleAds','comments','footer','sidebar','austereopuff','adSpotIsland']}),
                        dict(name='div', attrs ={'class':['article-links','wof','articleTools top','cN-multimediaGroup cfix','articleTools bottom']}),
                        dict(name='div', attrs ={'class':['clear','adSpot-textboxgr1','adSpot-textBox','articleTools-c3 cfix','articleExtras-bottom','span-16 last']}),
                        dict(name='div', attrs ={'class':[ 'sidebar span-5','cT-socialCommenting','cN-linkList','cN-topicSelector','cT-storyTools cfix','cT-imageMultimedia']}) ,
                        dict(name='iframe'),
                       ]
    extra_css = '''
                  h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
                  .cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
                  .articleBody{font-family:Arial,Helvetica,sans-serif; color:black;font-size:small;}
                  .cT-imageLandscape{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:x-small;}
                  .source{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:xx-small;}
                  #content{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
                  .pageprint{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                  #bylineDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
                  .featurePic-wide{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
                  #idfeaturepic{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
                  h3{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
                  h2{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
                  h4{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
                  h5{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
                  body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
                '''
    feeds          = [
                      ('Top Stories', 'http://feeds.smh.com.au/rssheadlines/top.xml'),
                      ('National', 'http://feeds.smh.com.au/rssheadlines/national.xml'),
                      ('World', 'http://feeds.smh.com.au/rssheadlines/world.xml'),
                      ('Business', 'http://www.smh.com.au/rssheadlines/business.xml'),
                      ('National Times', 'http://www.smh.com.au/rssheadlines/opinion/article/rss.xml'),
                      ('Entertainment', 'http://feeds.smh.com.au/rssheadlines/entertainment.xml'),
                      ('Technology', 'http://feeds.smh.com.au/rssheadlines/technology.xml'),
                      ('Sport', 'http://feeds.smh.com.au/rssheadlines/sport.xml'),
                    ]
    def preprocess_html(self, soup):
        """Move a stray ``bod`` element into the ``content`` container.

        When the page contains a ``bod`` tag, it is renamed to ``div``,
        detached from its current position and re-inserted as the last child
        of the element whose id is ``content``. Pages without a ``bod`` tag,
        or without a ``content`` element, are returned untouched.

        :param soup: parsed article page (BeautifulSoup-style tree).
        :return: the same ``soup`` object, possibly modified in place.
        """
        bod = soup.find('bod')
        if bod is None:
            return soup
        container = soup.find(id='content')
        if container is None:
            # Nowhere to move the element to: leave the page as it is instead
            # of detaching the element and then failing on a None container.
            return soup
        # A tag is renamed through ``name``; assigning ``tag`` only set an
        # unrelated attribute and left the element named ``bod``.
        bod.name = 'div'
        bod.extract()
        # Insert at index len(container), i.e. after the current last child.
        container.insert(len(container), bod)
        return soup
    def get_browser(self):
        """Return the base recipe's browser after calling
        ``set_handle_refresh(False)`` on it.

        :return: the browser object obtained from ``BasicNewsRecipe``.
        """
        # NOTE(review): the base method is invoked on the class without
        # ``self``; confirm this matches the calibre version in use.
        browser = BasicNewsRecipe.get_browser()
        browser.set_handle_refresh(False)
        return browser
    def get_article_url(self, article):
        """Return the link of ``article``, or an empty string for media links.

        :param article: feed entry exposing a ``link`` attribute.
        :return: ``article.link``, or ``''`` when the link contains the
            substring ``'media'``.
        """
        link = article.link
        return '' if 'media' in link else link
    remove_tags = [
                     dict(name='div', attrs={'id':['googleAds','moreGoogleAds','comments']})
                    ,dict(name='div', attrs={'class':'cT-imageMultimedia'})
                    ,dict(name=['object','embed','iframe'])
                  ]
    remove_tags_after = [dict(name='div',attrs={'class':'articleBody'})]
    keep_only_tags    = [dict(name='div',attrs={'id':'content'})]
    remove_attributes = ['width','height']
    def parse_index(self):
        """Build the article index from the "today's paper" page.

        Fetches ``http://www.smh.com.au/todays-paper``, sets ``self.cover_url``
        to the ``src`` of an image whose ``src`` ends with ``frontpage.jpg``
        (when several match, the last one is kept), and collects one article
        entry per element with class ``cN-storyHeadlineLead cfix`` that
        contains a link.

        :return: a one-element list holding a ``(feed title, articles)``
            tuple; the feed title is the page's ``<title>`` string and each
            article is a dict with ``title``, ``date``, ``url`` and
            ``description`` keys.
        """
        soup = self.index_to_soup('http://www.smh.com.au/todays-paper')

        for image in soup.findAll('img', src=True):
            if image['src'].endswith('frontpage.jpg'):
                self.cover_url = image['src']

        # The date string is the same for every article, so format it once.
        date = strftime(self.timefmt)

        articles = []
        for item in soup.findAll(attrs={'class':'cN-storyHeadlineLead cfix'}):
            feed_link = item.find('a', href=True)
            if not feed_link:
                # Headline blocks without a link cannot yield an article.
                continue
            descript = item.find('p')
            description = self.tag_to_string(descript) if descript else ''
            articles.append({
                'title'      : self.tag_to_string(feed_link),
                'date'       : date,
                'url'        : feed_link['href'],
                'description': description,
            })
        return [(soup.head.title.string, articles)]