#!/usr/bin/env python __license__ = 'GPL v3' __author__ = 'Kovid Goyal and Sujata Raman, Lorenzo Vigentini' __copyright__ = '2009, Kovid Goyal and Sujata Raman' __version__ = 'v1.02' __date__ = '10, January 2010' __description__ = 'Providing context and clarity on national and international news, peoples and cultures' '''csmonitor.com''' import re from calibre.web.feeds.news import BasicNewsRecipe class ChristianScienceMonitor(BasicNewsRecipe): __author__ = 'Kovid Goyal' description = 'Providing context and clarity on national and international news, peoples and cultures' cover_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif' title = 'Christian Science Monitor' publisher = 'The Christian Science Monitor' category = 'News, politics, culture, economy, general interest' language = 'en' encoding = 'utf-8' timefmt = '[%a, %d %b, %Y]' oldest_article = 16 max_articles_per_feed = 20 use_embedded_content = False recursion = 10 remove_javascript = True no_stylesheets = True requires_version = (0, 8, 39) def preprocess_raw_html(self, raw, url): try: from html5lib import parse root = parse(raw, namespaceHTMLElements=False, treebuilder='lxml').getroot() from lxml import etree for tag in root.xpath( '//script|//style|//noscript|//meta|//link|//object'): tag.getparent().remove(tag) for elem in list(root.iterdescendants(tag=etree.Comment)): elem.getparent().remove(elem) ans = etree.tostring(root, encoding=unicode) ans = re.sub('.*