Improved recipe for the Sydney Morning Herald

2025-07-09 03:04:10 -04:00 · 2009-11-20 21:53:55 -07:00 · 2009-11-20 21:53:55 -07:00 · 79f223845d
commit 79f223845d
parent 7f4dcb8827
1 changed file with 65 additions and 30 deletions
--- a/resources/recipes/smh.recipe
+++ b/resources/recipes/smh.recipe
@@ -6,51 +6,86 @@ __docformat__ = 'restructuredtext en'
 '''
 smh.com.au
 '''
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup


 class SMH(BasicNewsRecipe):

    title = 'Sydney Morning Herald'
    description = 'Business News, World News and Breaking News in Australia'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
    language = 'en_AU'

+    max_articles_per_feed = 100  # per-feed article cap (see calibre BasicNewsRecipe docs)
+    no_stylesheets = True  # do not load the site's own stylesheets; extra_css supplies the styling
+    use_embedded_content = False
+    no_javascript = True  # NOTE(review): not a BasicNewsRecipe option I can find (cf. remove_javascript) - confirm
+
+    timefmt               = ' [%A, %d %B, %Y]'  # strftime pattern, e.g. ' [Friday, 20 November, 2009]'
+    encoding = 'utf-8'
+
+    keep_only_tags = [dict(name='div', attrs ={'id':'content'})]  # keep only <div id="content">
+    remove_tags     = [  # elements dropped from the page: ads, comments, tool/share bars, sidebars, iframes
+                        dict(name='div', attrs={'align' :'right'}),
+                        dict(name='p', attrs={'class' :'comments'}),
+                        dict(name='a', attrs={'class' :['more-photos','performerpromo']}),
+                        dict(name='img', attrs={'alt' :'aap'}),
+                        dict(name='div', attrs ={'id':['googleAds','moreGoogleAds','comments','footer','sidebar','austereopuff','adSpotIsland']}),
+                        dict(name='div', attrs ={'class':['article-links','wof','articleTools top','cN-multimediaGroup cfix','articleTools bottom']}),
+                        dict(name='div', attrs ={'class':['clear','adSpot-textboxgr1','adSpot-textBox','articleTools-c3 cfix','articleExtras-bottom','span-16 last']}),
+                        dict(name='div', attrs ={'class':[ 'sidebar span-5','cT-socialCommenting','cN-linkList','cN-topicSelector','cT-storyTools cfix','cT-imageMultimedia']}) ,
+                        dict(name='iframe'),
+                       ]
+
+    extra_css = '''
+                  h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
+                  .cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
+                  .articleBody{font-family:Arial,Helvetica,sans-serif; color:black;font-size:small;}
+                  .cT-imageLandscape{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:x-small;}
+                  .source{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:xx-small;}
+                  #content{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+                  .pageprint{font-family:Arial,Helvetica,sans-serif;font-size:small;}
+                  #bylineDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
+                  .featurePic-wide{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+                  #idfeaturepic{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
+                  h3{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                  h2{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                  h4{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                  h5{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
+                  body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
+                '''  # CSS rules: Georgia serif for h1-h5, Arial sans-serif for body, captions and byline
+
+
+    feeds          = [  # (section title, RSS feed URL) pairs
+                      ('Top Stories', 'http://feeds.smh.com.au/rssheadlines/top.xml'),
+                      ('National', 'http://feeds.smh.com.au/rssheadlines/national.xml'),
+                      ('World', 'http://feeds.smh.com.au/rssheadlines/world.xml'),
+                      ('Business', 'http://www.smh.com.au/rssheadlines/business.xml'),  # NOTE(review): www. host, others use feeds. - confirm
+                      ('National Times', 'http://www.smh.com.au/rssheadlines/opinion/article/rss.xml'),  # NOTE(review): www. host as well
+                      ('Entertainment', 'http://feeds.smh.com.au/rssheadlines/entertainment.xml'),
+                      ('Technology', 'http://feeds.smh.com.au/rssheadlines/technology.xml'),
+                      ('Sport', 'http://feeds.smh.com.au/rssheadlines/sport.xml'),
+                    ]
+
+    def preprocess_html(self, soup):  # turn any <bod> element into a <div> at the end of #content; returns soup
+        bod, p = soup.find('bod'), soup.find(id='content')
+        if bod is not None:
+            bod.name = 'div'  # BeautifulSoup keeps the element name in .name; assigning .tag renamed nothing
+        if bod is not None and p is not None:  # without a #content container, leave the element where it is
+            bod.extract()  # detach first so len(p) below counts the remaining children only
+            p.insert(len(p), bod)
+        return soup

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.set_handle_refresh(False)
        return br

-    def parse_index(self):
-
-        soup = BeautifulSoup(self.browser.open('http://www.smh.com.au/text/').read())
-
-        feeds, articles = [], []
-        feed = None
+    def get_article_url(self, article):  # returns article.link, or '' when the link contains 'media'
+        link = article.link
+        if 'media' in link:  # plain substring test against the whole URL
+            return ''
+        return link


-        for tag in soup.findAll(['h3', 'a']):
-            if tag.name == 'h3':
-                if articles:
-                    feeds.append((feed, articles))
-                    articles = []
-                feed = self.tag_to_string(tag)
-            elif feed is not None and tag.has_key('href') and tag['href'].strip():
-                url = tag['href'].strip()
-                if url.startswith('/'):
-                    url   = 'http://www.smh.com.au' + url
-                title = self.tag_to_string(tag)
-                articles.append({
-                                 'title': title,
-                                 'url'  : url,
-                                 'date' : strftime('%a, %d %b'),
-                                 'description' : '',
-                                 'content'     : '',
-                                 })
-
-        return feeds
-