Fix recipe for The Christian Science Monitor

This commit is contained in:
Kovid Goyal 2010-01-11 19:42:16 -07:00
parent 63a1434d62
commit f8840debaf

View File

@ -1,19 +1,38 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Kovid Goyal and Sujata Raman, Lorenzo Vigentini'
__copyright__ = '2009, Kovid Goyal and Sujata Raman'
__version__ = 'v1.02'
__date__ = '10, January 2010'
__description__ = 'Providing context and clarity on national and international news, peoples and cultures'
'''csmonitor.com'''
import re import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ChristianScienceMonitor(BasicNewsRecipe): class ChristianScienceMonitor(BasicNewsRecipe):
title = 'Christian Science Monitor' author = 'Kovid Goyal, Sujata Raman and Lorenzo Vigentini'
description = 'Providing context and clarity on national and international news, peoples and cultures' description = 'Providing context and clarity on national and international news, peoples and cultures'
max_articles_per_feed = 20
__author__ = 'Kovid Goyal and Sujata Raman' cover_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif'
title = 'Christian Science Monitor'
publisher = 'The Christian Science Monitor'
category = 'News, politics, culture, economy, general interest'
language = 'en' language = 'en'
encoding = 'utf-8' encoding = 'utf-8'
no_stylesheets = True timefmt = '[%a, %d %b, %Y]'
use_embedded_content = False
oldest_article = 16
max_articles_per_feed = 20
use_embedded_content = False
recursion = 10
remove_javascript = True
no_stylesheets = True
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
@ -55,33 +74,15 @@ class ChristianScienceMonitor(BasicNewsRecipe):
] ]
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'id':['story','main']}), dict(name='div', attrs={'id':'mainColumn'}),
] ]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':['story-tools','videoPlayer','storyRelatedBottom','enlarge-photo','photo-paginate']}), dict(name='div', attrs={'id':['story-tools','videoPlayer','storyRelatedBottom','enlarge-photo','photo-paginate']}),
dict(name='div', attrs={'class':[ 'spacer3','divvy spacer7','comment','storyIncludeBottom']}), dict(name='div', attrs={'class':['storyToolbar cfx','podStoryRel','spacer3','divvy spacer7','comment','storyIncludeBottom']}),
dict(name='ul', attrs={'class':[ 'centerliststories']}) , dict(name='ul', attrs={'class':[ 'centerliststories']}) ,
dict(name='form', attrs={'id':[ 'commentform']}) , dict(name='form', attrs={'id':[ 'commentform']}) ,
] ]
remove_tags_after = [ dict(name='div', attrs={'class':[ 'ad csmAd']})]
def find_articles(self, section):
    '''
    Collect article entries from the ``head4`` elements of a parsed section.

    :param section: parsed node exposing ``findAll``; every ``head4``
                    element found in it is treated as one headline.
    :return: list of dicts with the keys ``title`` (the headline's text
             nodes joined by single spaces and stripped), ``url`` (the
             ``href`` of the first anchor inside the headline),
             ``description`` (always empty) and ``date`` (``strftime``
             called with ``'%a, %d %b'``).
    '''
    ans = []
    for heading in section.findAll('head4'):
        link = heading.find('a')
        if link is None:
            # Headline without a link: there is nothing to fetch.
            continue
        href = link.get('href')
        if not href:
            # An anchor lacking an href used to raise KeyError and abort
            # the whole section; skip only the unusable entry instead.
            continue
        title = ' '.join(heading.findAll(text=True)).strip()
        ans.append({'title': title, 'url': href, 'description': '',
                    'date': strftime('%a, %d %b')})
    # NOTE: appending '/output/print' to every url was tried and is disabled.
    return ans
def postprocess_html(self, soup, first_fetch):
    '''
    Reduce the parsed page to its ``html`` element.

    :param soup: parsed document exposing ``find``.
    :param first_fetch: not used by this implementation.
    :return: the ``html`` element after it has been detached from ``soup``
             via ``extract()``, or ``soup`` itself, untouched, when no
             ``html`` element is found.
    '''
    root = soup.find('html')
    if root is None:
        # Nothing to isolate; hand the document back as it came in.
        return soup
    # Detach the element so that anything outside it in the parsed tree
    # is left behind.
    root.extract()
    return root