diff --git a/src/calibre/web/feeds/recipes/recipe_chr_mon.py b/src/calibre/web/feeds/recipes/recipe_chr_mon.py
index e4c12cc931..f2fec1c24d 100644
--- a/src/calibre/web/feeds/recipes/recipe_chr_mon.py
+++ b/src/calibre/web/feeds/recipes/recipe_chr_mon.py
@@ -1,5 +1,6 @@
 import re
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class ChristianScienceMonitor(BasicNewsRecipe):
@@ -7,43 +8,77 @@ class ChristianScienceMonitor(BasicNewsRecipe):
     title = 'Christian Science Monitor'
     description = 'Providing context and clarity on national and international news, peoples and cultures'
     max_articles_per_feed = 20
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     language = 'en'
-
+    encoding = 'utf-8'
     no_stylesheets = True
     use_embedded_content = False
-
-
-    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+
+
+    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
 [
     (r'<div class="pubdate">.*?</div>', lambda m: ''),
     (r'Full HTML version of this story which may include photos, graphics, and related links.*</body>', lambda match : '</body>'),
 ]]
-
-    def parse_index(self):
-        soup = self.index_to_soup('http://www.csmonitor.com/textedition')
-        feeds = []
-        for tag in soup.findAll(['h2', 'p']):
-            if tag.name == 'h2':
-                title = self.tag_to_string(tag)
-                feeds.append((title, []))
-            elif tag.has_key('class') and tag['class'] == 'story' and feeds:
-                a = tag.find('a')
-                if a is not None and a.has_key('href'):
-                    art = {
-                        'title': self.tag_to_string(a),
-                        'url' : 'http://www.csmonitor.com'+a['href'],
-                        'date' : '',
-                        }
-                    a.extract()
-                    art['description'] = self.tag_to_string(tag).strip()
-                    feeds[-1][1].append(art)
-        return feeds
-
+    extra_css = '''
+                    h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large}
+                    .sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;}
+                    .byline{ font-family:Arial,Helvetica,sans-serif ; color:#999999; font-size: x-small;}
+                    .postdate{color:#999999 ; font-family:Arial,Helvetica,sans-serif ; font-size: x-small; }
+                    h3{color:#999999 ; font-family:Arial,Helvetica,sans-serif ; font-size: x-small; }
+                    .photoCutline{ color:#333333 ; font-family:Arial,Helvetica,sans-serif ; font-size: x-small; }
+                    .photoCredit{ color:#999999 ; font-family:Arial,Helvetica,sans-serif ; font-size: x-small; }
+                    #story{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: small; }
+                    #main{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: small; }
+                    #photo-details{ font-family:Arial,Helvetica,sans-serif ; color:#999999; font-size: x-small;}
+                    span.name{color:#205B87;font-family: Georgia,Times,"Times New Roman",serif; font-size: x-small}
+                    p#dateline{color:#444444 ; font-family:Arial,Helvetica,sans-serif ; font-style:italic;}
+                '''
+    feeds = [
+        (u'Top Stories', u'http://rss.csmonitor.com/feeds/top'),
+        (u'World', u'http://rss.csmonitor.com/feeds/world'),
+        (u'USA', u'http://rss.csmonitor.com/feeds/usa'),
+        (u'Commentary', u'http://rss.csmonitor.com/feeds/commentary'),
+        (u'Money', u'http://rss.csmonitor.com/feeds/wam'),
+        (u'Learning', u'http://rss.csmonitor.com/feeds/learning'),
+        (u'Living', u'http://rss.csmonitor.com/feeds/living'),
+        (u'Innovation', u'http://rss.csmonitor.com/feeds/scitech'),
+        (u'Gardening', u'http://rss.csmonitor.com/feeds/gardening'),
+        (u'Environment', u'http://rss.csmonitor.com/feeds/environment'),
+        (u'Arts', u'http://rss.csmonitor.com/feeds/arts'),
+        (u'Books', u'http://rss.csmonitor.com/feeds/books'),
+        (u'Home Forum', u'http://rss.csmonitor.com/feeds/homeforum')
+    ]
+
+    keep_only_tags = [
+        dict(name='div', attrs={'id':['story','main']}),
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'id':['story-tools','videoPlayer','storyRelatedBottom','enlarge-photo','photo-paginate']}),
+        dict(name='div', attrs={'class':['spacer3','divvy spacer7','comment','storyIncludeBottom']}),
+        dict(name='ul', attrs={'class':['centerliststories']}),
+        dict(name='form', attrs={'id':['commentform']}),
+    ]
+
+
+    def find_articles(self, section):
+        ans = []
+        for x in section.findAll('head4'):
+            title = ' '.join(x.findAll(text=True)).strip()
+            a = x.find('a')
+            if not a: continue
+            href = a['href']
+            ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
+
+        #for x in ans:
+        #    x['url'] += '/output/print'
+        return ans
+
     def postprocess_html(self, soup, first_fetch):
         html = soup.find('html')
         if html is None:
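
The two retained preprocess_regexps are easy to sanity-check in isolation outside calibre. A minimal sketch of what they strip from a fetched page; the sample HTML fragment below is made up for illustration and the variable names are local to the sketch:

    import re

    # Same (pattern, replacement) pairs as in the recipe above.
    regexps = [(re.compile(p, re.IGNORECASE | re.DOTALL), f) for p, f in [
        (r'<div class="pubdate">.*?</div>', lambda m: ''),
        (r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
         lambda m: '</body>'),
    ]]

    # Hypothetical page fragment: a pubdate block, story text, and the
    # trailing "Full HTML version" boilerplate the second pattern removes.
    html = ('<body><div class="pubdate">May 1, 2009</div><p>story text</p>'
            'Full HTML version of this story which may include photos, '
            'graphics, and related links.</body>')
    for pat, func in regexps:
        html = pat.sub(func, html)
    print(html)  # -> <body><p>story text</p></body>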
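
For an end-to-end smoke test, calibre's ebook-convert can run a recipe with its --test switch, which downloads only a couple of articles from the first couple of feeds. A possible invocation, assuming the class above has been saved out as a standalone .recipe file (the filename is hypothetical):

    ebook-convert csmonitor.recipe test.epub --test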