#!/usr/bin/env python

__license__     = 'GPL v3'
__author__      = 'Kovid Goyal and Sujata Raman, Lorenzo Vigentini'
__copyright__   = '2009, Kovid Goyal and Sujata Raman'
__version__     = 'v1.02'
__date__        = '10, January 2010'
__description__ = 'Providing context and clarity on national and international news, peoples and cultures'

'''csmonitor.com'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class ChristianScienceMonitor(BasicNewsRecipe):

    __author__ = 'Kovid Goyal'
    description = 'Providing context and clarity on national and international news, peoples and cultures'
    cover_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif'

    title = 'Christian Science Monitor'
    publisher = 'The Christian Science Monitor'
    category = 'News, politics, culture, economy, general interest'

    language = 'en'
    encoding = 'utf-8'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 16
    max_articles_per_feed = 20
    use_embedded_content = False
    recursion = 10

    remove_javascript = True
    no_stylesheets = True

    def append_page(self, soup, appendtag, position):
        # Follow the 'Next' link in the article's pagination block and
        # recursively merge the body of each subsequent page into appendtag.
        nav = soup.find('div', attrs={'class': 'navigation'})
        if nav:
            pager = nav.findAll('a')
            for part in pager:
                if 'Next' in self.tag_to_string(part):
                    nexturl = 'http://www.csmonitor.com' + part.get('href', '')
                    soup2 = self.index_to_soup(nexturl)
                    texttag = soup2.find('div', attrs={'class': re.compile('list-article-.*')})
                    if texttag is None:
                        continue
                    # Continuation pages repeat the description and headline;
                    # strip them before merging.
                    for tc in soup2.findAll(attrs={'class': 'list-description'}):
                        tc.extract()
                    trash_h = soup2.h1
                    if trash_h is not None:
                        trash_h.extract()
                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        # Prefer the single-page print view when the article links to one;
        # otherwise stitch the paginated article together ourselves.
        PRINT_RE = re.compile(r'/layout/set/print/content/view/print/[0-9]*')
        html = str(soup)
        print_found = PRINT_RE.findall(html)
        if print_found:
            print_url = 'http://www.csmonitor.com' + print_found[0]
            print_soup = self.index_to_soup(print_url)
        else:
            self.append_page(soup, soup.body, 3)

            # Remove navigation blocks, inline-styled elements and byline cruft.
            for ta in soup.findAll(attrs={'class': re.compile('navigation.*')}):
                ta.extract()
            for tb in soup.findAll(attrs={'style': re.compile('.*')}):
                tb.extract()
            for td in soup.findAll(attrs={'class': 'sByline'}):
                td.extract()
            print_soup = soup
        return print_soup

    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
            (r'', lambda match: ''),
        ]
    ]
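
# A minimal usage sketch: calibre recipes like this one are not run directly
# as scripts. Assuming a working calibre install, save this file under an
# illustrative name such as christian_science_monitor.recipe and build the
# e-book from the command line with calibre's ebook-convert tool:
#
#   ebook-convert christian_science_monitor.recipe csm.epub
#
# ebook-convert fetches the feeds, applies preprocess_html()/append_page()
# to each article, and writes the result in the requested output format.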