diff --git a/src/libprs500/ebooks/lrf/web/profiles/chr_mon.py b/src/libprs500/ebooks/lrf/web/profiles/chr_mon.py index 0fc068af42..42dbfc1dc7 100644 --- a/src/libprs500/ebooks/lrf/web/profiles/chr_mon.py +++ b/src/libprs500/ebooks/lrf/web/profiles/chr_mon.py @@ -1,38 +1,46 @@ -import re + +import re, time from libprs500.ebooks.lrf.web.profiles import DefaultProfile +from libprs500.ebooks.BeautifulSoup import BeautifulSoup class ChristianScienceMonitor(DefaultProfile): title = 'Christian Science Monitor' max_recursions = 2 max_articles_per_feed = 20 - use_pubdate = False - html_description = True - html2lrf_options = ['--ignore-tables', '--base-font-size=8.0', '--wordspace=2.0',] + no_stylesheets = True + + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in -[ - (r'.*?' , lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'', lambda match : ''), - (r'.*?
', lambda match : '
'), - (r'
.*?
', lambda match : ''), - (r'
.*?', lambda match : ''), - - ] - ] + [ + (r'
.*?
', lambda m: ''), + (r'Full HTML version of this story which may include photos, graphics, and related links.*', + lambda match : ''), + ]] - - def get_feeds(self): - return [ ('Top News', 'http://rss.csmonitor.com/feeds/top'), - ('Terrorism', 'http://rss.csmonitor.com/terrorismSecurity'), - ('World', 'http://rss.csmonitor.com/feeds/world'), - ] - - - def print_version(self, url): - resolved_url = self.browser.open(url).geturl() - return resolved_url.strip()[:-1] \ No newline at end of file + def parse_feeds(self): + soup = BeautifulSoup(self.browser.open('http://www.csmonitor.com/textedition')) + articles = {} + feed = [] + for tag in soup.findAll(['h2', 'p']): + if tag.name == 'h2': + title = self.tag_to_string(tag) + feed = [] + articles[title] = feed + elif tag.has_key('class') and tag['class'] == 'story': + a = tag.find('a') + if a is not None and a.has_key('href'): + feed.append({ + 'title': self.tag_to_string(a), + 'url' : 'http://www.csmonitor.com'+a['href'], + 'date' : time.strftime('%d %b'), + 'content' : '', + }) + a.extract() + feed[-1]['description'] = self.tag_to_string(tag).strip() + return articles +