From 893035b874434a512f8fc54eeb4596c46dacae38 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 20 Mar 2011 11:21:01 -0600 Subject: [PATCH] Fix Christian Science Monitor --- recipes/chr_mon.recipe | 66 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/recipes/chr_mon.recipe b/recipes/chr_mon.recipe index 2b431ebd0b..6f41b95763 100644 --- a/recipes/chr_mon.recipe +++ b/recipes/chr_mon.recipe @@ -8,13 +8,13 @@ __description__ = 'Providing context and clarity on national and international n '''csmonitor.com''' - import re from calibre.web.feeds.news import BasicNewsRecipe + class ChristianScienceMonitor(BasicNewsRecipe): - author = 'Kovid Goyal, Sujata Raman and Lorenzo Vigentini' + __author__ = 'Kovid Goyal' description = 'Providing context and clarity on national and international news, peoples and cultures' cover_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif' @@ -34,6 +34,49 @@ class ChristianScienceMonitor(BasicNewsRecipe): remove_javascript = True no_stylesheets = True + def append_page(self, soup, appendtag, position): + nav = soup.find('div',attrs={'class':'navigation'}) + if nav: + pager = nav.findAll('a') + for part in pager: + if 'Next' in part: + nexturl = ('http://www.csmonitor.com' + + re.findall(r'href="(.*?)"', str(part))[0]) + soup2 = self.index_to_soup(nexturl) + texttag = soup2.find('div', + attrs={'class': re.compile('list-article-.*')}) + trash_c = soup2.findAll(attrs={'class': 'list-description'}) + trash_h = soup2.h1 + for tc in trash_c: tc.extract() + trash_h.extract() + + newpos = len(texttag.contents) + self.append_page(soup2, texttag, newpos) + texttag.extract() + appendtag.insert(position, texttag) + + def preprocess_html(self, soup): + PRINT_RE = re.compile(r'/layout/set/print/content/view/print/[0-9]*') + html = str(soup) + try: + print_found = PRINT_RE.findall(html) + except Exception: + pass + if print_found: + print_url = 'http://www.csmonitor.com' + print_found[0] + print_soup = self.index_to_soup(print_url) + else: + self.append_page(soup, soup.body, 3) + + trash_a = soup.findAll(attrs={'class': re.compile('navigation.*')}) + trash_b = soup.findAll(attrs={'style': re.compile('.*')}) + trash_d = soup.findAll(attrs={'class': 'sByline'}) + for ta in trash_a: ta.extract() + for tb in trash_b: tb.extract() + for td in trash_d: td.extract() + + print_soup = soup + return print_soup preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ @@ -43,7 +86,6 @@ class ChristianScienceMonitor(BasicNewsRecipe): (r'Full HTML version of this story which may include photos, graphics, and related links.*', lambda match : ''), ]] - extra_css = ''' h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large} .sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;} @@ -56,10 +98,9 @@ class ChristianScienceMonitor(BasicNewsRecipe): #main{font-family:Arial,Tahoma,Verdana,Helvetica,sans-serif ; font-size: small; } #photo-details{ font-family:Arial,Helvetica,sans-serif ; color:#999999; font-size: x-small;} span.name{color:#205B87;font-family: Georgia,Times,"Times New Roman",serif; font-size: x-small} - p#dateline{color:#444444 ; font-family:Arial,Helvetica,sans-serif ; font-style:italic;} - ''' - feeds = [ - (u'Top Stories' , u'http://rss.csmonitor.com/feeds/top'), + p#dateline{color:#444444 ; font-family:Arial,Helvetica,sans-serif ; font-style:italic;} ''' + + feeds = [(u'Top Stories', u'http://rss.csmonitor.com/feeds/top'), (u'World' , u'http://rss.csmonitor.com/feeds/world'), (u'USA' , u'http://rss.csmonitor.com/feeds/usa'), (u'Commentary' , u'http://rss.csmonitor.com/feeds/commentary'), @@ -74,9 +115,7 @@ class ChristianScienceMonitor(BasicNewsRecipe): (u'Home Forum' , u'http://rss.csmonitor.com/feeds/homeforum') ] - keep_only_tags = [ - dict(name='div', attrs={'id':'mainColumn'}), - ] + keep_only_tags = [dict(name='div', attrs={'id':'mainColumn'}), ] remove_tags = [ dict(name='div', attrs={'id':['story-tools','videoPlayer','storyRelatedBottom','enlarge-photo','photo-paginate']}), @@ -86,7 +125,10 @@ class ChristianScienceMonitor(BasicNewsRecipe): 'hide', 'podBrdr']}), dict(name='ul', attrs={'class':[ 'centerliststories']}) , dict(name='form', attrs={'id':[ 'commentform']}) , + dict(name='div', attrs={'class': ['ui-comments']}) ] - remove_tags_after = [ dict(name='div', attrs={'class':[ 'ad csmAd']})] - + remove_tags_after = [ dict(name='div', attrs={'class':[ 'ad csmAd']}), + dict(name='div', attrs={'class': [re.compile('navigation.*')]}), + dict(name='div', attrs={'style': [re.compile('.*')]}) + ]