Fix recipe for The Christian Science Monitor

This commit is contained in:
Kovid Goyal 2010-01-11 19:42:16 -07:00
parent 63a1434d62
commit f8840debaf

View File

@ -1,19 +1,38 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Kovid Goyal and Sujata Raman, Lorenzo Vigentini'
__copyright__ = '2009, Kovid Goyal and Sujata Raman'
__version__ = 'v1.02'
__date__ = '10, January 2010'
__description__ = 'Providing context and clarity on national and international news, peoples and cultures'
'''csmonitor.com'''
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class ChristianScienceMonitor(BasicNewsRecipe):
# Recipe settings for the Christian Science Monitor, assigned in the class body.
# NOTE(review): title, max_articles_per_feed, use_embedded_content and
# no_stylesheets are each assigned twice below with identical values, and both
# `author` and `__author__` are set (with different strings) - presumably the
# old and new lines of the change shown together; confirm and deduplicate.
title = 'Christian Science Monitor'
author = 'Kovid Goyal, Sujata Raman and Lorenzo Vigentini'
description = 'Providing context and clarity on national and international news, peoples and cultures'
max_articles_per_feed = 20
__author__ = 'Kovid Goyal and Sujata Raman'
# Absolute URL of the site logo image used as the cover.
cover_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif'
title = 'Christian Science Monitor'
publisher = 'The Christian Science Monitor'
category = 'News, politics, culture, economy, general interest'
language = 'en'
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = False
# strftime-style pattern: abbreviated weekday, day, abbreviated month, year.
timefmt = '[%a, %d %b, %Y]'
# NOTE(review): unit is not visible here - presumably days; confirm against
# the BasicNewsRecipe documentation.
oldest_article = 16
max_articles_per_feed = 20
use_embedded_content = False
# NOTE(review): presumably the maximum link-following depth - confirm.
recursion = 10
remove_javascript = True
no_stylesheets = True
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
@ -55,33 +74,15 @@ class ChristianScienceMonitor(BasicNewsRecipe):
]
# Tag filters: each entry is a dict giving a tag name and the attribute
# values (id or class) to match.
# NOTE(review): presumably consumed by BasicNewsRecipe to prune the fetched
# page - confirm against the calibre documentation.
# NOTE(review): keep_only_tags lists both the 'story'/'main' ids and
# 'mainColumn'; these look like the old and new lines of the change shown
# together - confirm which one belongs in the committed file.
keep_only_tags = [
dict(name='div', attrs={'id':['story','main']}),
dict(name='div', attrs={'id':'mainColumn'}),
]
remove_tags = [
dict(name='div', attrs={'id':['story-tools','videoPlayer','storyRelatedBottom','enlarge-photo','photo-paginate']}),
# The next two entries overlap: the second class list is a superset of the
# first (it adds 'storyToolbar cfx' and 'podStoryRel').
dict(name='div', attrs={'class':[ 'spacer3','divvy spacer7','comment','storyIncludeBottom']}),
dict(name='div', attrs={'class':['storyToolbar cfx','podStoryRel','spacer3','divvy spacer7','comment','storyIncludeBottom']}),
dict(name='ul', attrs={'class':[ 'centerliststories']}) ,
dict(name='form', attrs={'id':[ 'commentform']}) ,
]
# Matches the div whose class is 'ad csmAd'.
remove_tags_after = [ dict(name='div', attrs={'class':[ 'ad csmAd']})]
def find_articles(self, section):
    '''Build a list of article dicts from the ``head4`` elements in *section*.

    For every ``head4`` element that contains an ``a`` element, one dict is
    produced with the keys ``title`` (the element's text nodes joined by
    spaces and stripped), ``url`` (the anchor's ``href``), ``description``
    (always empty) and ``date`` (``strftime('%a, %d %b')``). Elements without
    an anchor are skipped.
    '''
    articles = []
    for heading in section.findAll('head4'):
        headline = ' '.join(heading.findAll(text=True)).strip()
        link = heading.find('a')
        if link:
            articles.append({
                'title': headline,
                'url': link['href'],
                'description': '',
                'date': strftime('%a, %d %b'),
            })
    return articles
def postprocess_html(self, soup, first_fetch):
    '''Return the ``html`` element of *soup*, detached via ``extract()``.

    If *soup* has no ``html`` element, *soup* itself is returned unchanged.
    *first_fetch* is not used.
    '''
    root = soup.find('html')
    if root is not None:
        # Detach the element from its parent document before handing it back.
        root.extract()
        return root
    return soup