Update Dawn

This commit is contained in:
Kovid Goyal 2014-09-07 17:31:52 +05:30
parent 82f03dca32
commit 20e861dfaf

View File

@@ -16,22 +16,19 @@ class DawnRecipe(BasicNewsRecipe):
remove_empty_feeds = True remove_empty_feeds = True
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
#auto_cleanup = True
#auto_cleanup_keep = '//dix[@class="slideshow"]'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
encoding = 'utf-8' encoding = 'utf-8'
keep_only_tags = [dict(name='div', attrs={'class':'push-half--sides push--top'}),
dict(name='article', attrs={'class':'story story--single push-half'})]
# Feeds from http://www.dawn.com/wps/wcm/connect/dawn-content-library/dawn/services/rss # Feeds from http://www.dawn.com/wps/wcm/connect/dawn-content-library/dawn/services/rss
feeds = [] feeds = []
feeds.append((u'Latest News', u'http://feedproxy.google.com/Dawn-All-News')) feeds.append((u'Latest News', u'http://feeds.feedburner.com/dawn-news'))
feeds.append((u'Pakistan News', u'http://feeds2.feedburner.com/dawn/news/pakistan'))
feeds.append((u'World News', u'http://feeds2.feedburner.com/dawn/news/world'))
feeds.append((u'Business News', u'http://feeds2.feedburner.com/dawn/news/business'))
feeds.append((u'Sport News', u'http://feeds2.feedburner.com/dawn/news/sport'))
feeds.append((u'Cricket News', u'http://feeds2.feedburner.com/dawn/news/cricket'))
feeds.append((u'Sci-tech News', u'http://feeds2.feedburner.com/dawn/news/technology'))
feeds.append((u'Entertainment News', u'http://feeds2.feedburner.com/dawn/news/entertainment'))
feeds.append((u'Columnists', u'http://feeds2.feedburner.com/dawn/news/columnists'))
#feeds.append((u'', u'')) #feeds.append((u'', u''))
conversion_options = {'comments': description, 'tags': category, 'language': 'en', conversion_options = {'comments': description, 'tags': category, 'language': 'en',
@@ -45,48 +42,50 @@ class DawnRecipe(BasicNewsRecipe):
span.news_byline {font-size: x-small; color: #696969; margin-top: 1em;} span.news_byline {font-size: x-small; color: #696969; margin-top: 1em;}
''' '''
def print_version(self, url): #def print_version(self, url):
return url + '?pagedesign=Dawn_PrintlyFriendlyPage' #url = url.split('?')[0] + '/print'
#print(url)
#return url
def preprocess_html(self, soup): #def preprocess_html(self, soup):
newBody = Tag(soup, 'body') #newBody = Tag(soup, 'body')
for cl in ['page_title', 'news_headline', 'news_byline']: #for cl in ['page_title', 'news_headline', 'news_byline']:
tag = soup.find('span', attrs = {'class': cl}) #tag = soup.find('span', attrs = {'class': cl})
if tag: #if tag:
# They like their <br> tags; I don't: does not work well on small screens. ## They like their <br> tags; I don't: does not work well on small screens.
if tag['class'] == 'news_byline': #if tag['class'] == 'news_byline':
for br in tag.findAll('br'): #for br in tag.findAll('br'):
br.extract() #br.extract()
newBody.append(tag) #newBody.append(tag)
table = soup.find('table', attrs = {'id': 'body table'}) #table = soup.find('table', attrs = {'id': 'body table'})
if table: #if table:
for td in table.findAll('td', attrs = {'class': 'news_story'}): #for td in table.findAll('td', attrs = {'class': 'news_story'}):
for tag in td.findAll(True): #for tag in td.findAll(True):
if tag.has_key('id') and tag['id'] == 'banner-img_slide': #if tag.has_key('id') and tag['id'] == 'banner-img_slide':
tag.extract() #tag.extract()
elif tag.has_key('style'): #elif tag.has_key('style'):
del tag['style'] #del tag['style']
elif tag.name == 'script': #elif tag.name == 'script':
tag.extract() #tag.extract()
# They like their <br> tags; I don't: does not work well on small screens. ## They like their <br> tags; I don't: does not work well on small screens.
center = td.find('center') #center = td.find('center')
if center: #if center:
for br in center.findNextSiblings('br'): #for br in center.findNextSiblings('br'):
br.extract() #br.extract()
for br in center.findPreviousSiblings('br'): #for br in center.findPreviousSiblings('br'):
br.extract() #br.extract()
for attr in ['align', 'valign']: #for attr in ['align', 'valign']:
if td.has_key(attr): #if td.has_key(attr):
del td[attr] #del td[attr]
td.name = 'div' #td.name = 'div'
newBody.append(td) #newBody.append(td)
soup.body.replaceWith(newBody) #soup.body.replaceWith(newBody)
return soup #return soup