Update Dawn

2026-04-12 12:12:03 -04:00 · 2014-09-07 17:31:52 +05:30 · 2014-09-07 17:31:52 +05:30 · 20e861dfaf
commit 20e861dfaf
parent 82f03dca32
1 changed files with 44 additions and 45 deletions
--- a/recipes/dawn.recipe
+++ b/recipes/dawn.recipe
@ -16,22 +16,19 @@ class DawnRecipe(BasicNewsRecipe):
    remove_empty_feeds = True
    oldest_article = 2
    max_articles_per_feed = 100
+    #auto_cleanup = True
+    #auto_cleanup_keep = '//div[@class="slideshow"]'
+    

    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
+    keep_only_tags = [dict(name='div', attrs={'class':'push-half--sides  push--top'}),
+		      dict(name='article', attrs={'class':'story  story--single  push-half'})]

    # Feeds from http://www.dawn.com/wps/wcm/connect/dawn-content-library/dawn/services/rss
    feeds = []
-    feeds.append((u'Latest News', u'http://feedproxy.google.com/Dawn-All-News'))
-    feeds.append((u'Pakistan News', u'http://feeds2.feedburner.com/dawn/news/pakistan'))
-    feeds.append((u'World News', u'http://feeds2.feedburner.com/dawn/news/world'))
-    feeds.append((u'Business News', u'http://feeds2.feedburner.com/dawn/news/business'))
-    feeds.append((u'Sport News', u'http://feeds2.feedburner.com/dawn/news/sport'))
-    feeds.append((u'Cricket News', u'http://feeds2.feedburner.com/dawn/news/cricket'))
-    feeds.append((u'Sci-tech News', u'http://feeds2.feedburner.com/dawn/news/technology'))
-    feeds.append((u'Entertainment News', u'http://feeds2.feedburner.com/dawn/news/entertainment'))
-    feeds.append((u'Columnists', u'http://feeds2.feedburner.com/dawn/news/columnists'))
+    feeds.append((u'Latest News', u'http://feeds.feedburner.com/dawn-news'))
    #feeds.append((u'', u''))

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
@ -45,48 +42,50 @@ class DawnRecipe(BasicNewsRecipe):
                span.news_byline {font-size: x-small; color: #696969; margin-top: 1em;}
                '''

-    def print_version(self, url):
-        return url + '?pagedesign=Dawn_PrintlyFriendlyPage'
+    #def print_version(self, url):
+        #url = url.split('?')[0] + '/print'
+        #print(url)
+        #return url

-    def preprocess_html(self, soup):
-        newBody = Tag(soup, 'body')
+    #def preprocess_html(self, soup):
+        #newBody = Tag(soup, 'body')

-        for cl in ['page_title', 'news_headline', 'news_byline']:
-            tag = soup.find('span', attrs = {'class': cl})
-            if tag:
-                # They like their <br> tags; I don't: does not work well on small screens.
-                if tag['class'] == 'news_byline':
-                    for br in tag.findAll('br'):
-                        br.extract()
+        #for cl in ['page_title', 'news_headline', 'news_byline']:
+            #tag = soup.find('span', attrs = {'class': cl})
+            #if tag:
+                ## They like their <br> tags; I don't: does not work well on small screens.
+                #if tag['class'] == 'news_byline':
+                    #for br in tag.findAll('br'):
+                        #br.extract()

-                newBody.append(tag)
+                #newBody.append(tag)

-        table = soup.find('table', attrs = {'id': 'body table'})
-        if table:
-            for td in table.findAll('td', attrs = {'class': 'news_story'}):
-                for tag in td.findAll(True):
-                    if tag.has_key('id') and tag['id'] == 'banner-img_slide':
-                        tag.extract()
-                    elif tag.has_key('style'):
-                        del tag['style']
-                    elif tag.name == 'script':
-                        tag.extract()
+        #table = soup.find('table', attrs = {'id': 'body table'})
+        #if table:
+            #for td in table.findAll('td', attrs = {'class': 'news_story'}):
+                #for tag in td.findAll(True):
+                    #if tag.has_key('id') and tag['id'] == 'banner-img_slide':
+                        #tag.extract()
+                    #elif tag.has_key('style'):
+                        #del tag['style']
+                    #elif tag.name == 'script':
+                        #tag.extract()

-                # They like their <br> tags; I don't: does not work well on small screens.
-                center = td.find('center')
-                if center:
-                    for br in center.findNextSiblings('br'):
-                        br.extract()
-                    for br in center.findPreviousSiblings('br'):
-                        br.extract()
+                ## They like their <br> tags; I don't: does not work well on small screens.
+                #center = td.find('center')
+                #if center:
+                    #for br in center.findNextSiblings('br'):
+                        #br.extract()
+                    #for br in center.findPreviousSiblings('br'):
+                        #br.extract()

-                for attr in ['align', 'valign']:
-                    if td.has_key(attr):
-                        del td[attr]
+                #for attr in ['align', 'valign']:
+                    #if td.has_key(attr):
+                        #del td[attr]

-                td.name = 'div'
-                newBody.append(td)
+                #td.name = 'div'
+                #newBody.append(td)

-            soup.body.replaceWith(newBody)
+            #soup.body.replaceWith(newBody)

-            return soup
+            #return soup