From 536a4eaf008b527a349137523c478c81c5f37722 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 31 Jan 2008 01:50:24 +0000 Subject: [PATCH] Add profiles for The Atlantic, The Christian Science Monitor, The Jerusalem Post and Reuters --- src/libprs500/ebooks/lrf/web/__init__.py | 11 +++- .../ebooks/lrf/web/profiles/atlantic.py | 59 +++++++++++++++++++ .../ebooks/lrf/web/profiles/chr_mon.py | 38 ++++++++++++ .../ebooks/lrf/web/profiles/jpost.py | 36 +++++++++++ .../ebooks/lrf/web/profiles/reuters.py | 39 ++++++++++++ 5 files changed, 180 insertions(+), 3 deletions(-) create mode 100644 src/libprs500/ebooks/lrf/web/profiles/atlantic.py create mode 100644 src/libprs500/ebooks/lrf/web/profiles/chr_mon.py create mode 100644 src/libprs500/ebooks/lrf/web/profiles/jpost.py create mode 100644 src/libprs500/ebooks/lrf/web/profiles/reuters.py diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py index 2452d7b4d1..4eb0432c18 100644 --- a/src/libprs500/ebooks/lrf/web/__init__.py +++ b/src/libprs500/ebooks/lrf/web/__init__.py @@ -25,10 +25,15 @@ from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal from libprs500.ebooks.lrf.web.profiles.barrons import Barrons from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio from libprs500.ebooks.lrf.web.profiles.dilbert import Dilbert -from libprs500.ebooks.lrf.web.profiles.cnn import CNN +from libprs500.ebooks.lrf.web.profiles.cnn import CNN +from libprs500.ebooks.lrf.web.profiles.chr_mon import ChristianScienceMonitor +from libprs500.ebooks.lrf.web.profiles.jpost import JerusalemPost +from libprs500.ebooks.lrf.web.profiles.reuters import Reuters +from libprs500.ebooks.lrf.web.profiles.atlantic import Atlantic -builtin_profiles = [Barrons, BBC, CNN, Dilbert, Economist, FazNet, Newsweek, NewYorkReviewOfBooks, NYTimes, \ - Portfolio, SpiegelOnline, WallStreetJournal, ZeitNachrichten, \ +builtin_profiles = [Atlantic, Barrons, BBC, ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet, + JerusalemPost, Newsweek, NewYorkReviewOfBooks, NYTimes, + Portfolio, Reuters, SpiegelOnline, WallStreetJournal, ZeitNachrichten, ] available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/web/profiles/atlantic.py b/src/libprs500/ebooks/lrf/web/profiles/atlantic.py new file mode 100644 index 0000000000..6bd3944db6 --- /dev/null +++ b/src/libprs500/ebooks/lrf/web/profiles/atlantic.py @@ -0,0 +1,59 @@ +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +import re +from libprs500.ebooks.lrf.web.profiles import DefaultProfile +from libprs500.ebooks.BeautifulSoup import BeautifulSoup + +class Atlantic(DefaultProfile): + + title = 'The Atlantic' + max_recursions = 2 + INDEX = 'http://www.theatlantic.com/doc/current' + + preprocess_regexps = [ + (re.compile(r'
.*?' , lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'', lambda match : ''), + (r'.*?
', lambda match : '
'), + (r'
.*?
', lambda match : ''), + (r'
.*?', lambda match : ''), + + ] + ] + + + + def get_feeds(self): + return [ ('Top News', 'http://rss.csmonitor.com/feeds/top'), + ('Terrorism', 'http://rss.csmonitor.com/terrorismSecurity'), + ('World', 'http://rss.csmonitor.com/feeds/world'), + ] + + + def print_version(self, url): + resolved_url = self.browser.open(url).geturl() + return resolved_url.strip()[:-1] \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/web/profiles/jpost.py b/src/libprs500/ebooks/lrf/web/profiles/jpost.py new file mode 100644 index 0000000000..d72e63c645 --- /dev/null +++ b/src/libprs500/ebooks/lrf/web/profiles/jpost.py @@ -0,0 +1,36 @@ +import re +from libprs500.ebooks.lrf.web.profiles import DefaultProfile + +class JerusalemPost(DefaultProfile): + + title = 'Jerusalem Post' + max_recursions = 2 + max_articles_per_feed = 10 + + + + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in +[ + (r'.*?' , lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'
', lambda match : ''), + (r'\'NWAnews.com', lambda match : ''), + (r'', lambda match : ''), + (r'

.*?', lambda match : ''), + + ] + ] + + def get_feeds(self): + return [ ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'), + ('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'), + ('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'), + ('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'), + ('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'), + ] + + def print_version(self, url): + return ('http://www.jpost.com/servlet/Satellite?cid=' + url.rpartition('&')[2] + '&pagename=JPost%2FJPArticle%2FPrinter') + diff --git a/src/libprs500/ebooks/lrf/web/profiles/reuters.py b/src/libprs500/ebooks/lrf/web/profiles/reuters.py new file mode 100644 index 0000000000..449138e76d --- /dev/null +++ b/src/libprs500/ebooks/lrf/web/profiles/reuters.py @@ -0,0 +1,39 @@ +import re +from libprs500.ebooks.lrf.web.profiles import DefaultProfile + + +class Reuters(DefaultProfile): + + title = 'Reuters' + max_recursions = 2 + max_articles_per_feed = 10 + html_description = True + + + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in +[ + ##(r'.*?' , lambda match : ''), + (r'

.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?
', lambda match : ''), + (r'

Share:

.*?', lambda match : ''), + (r'
.*?
', lambda match : '
'), + ] + ] + + + + def get_feeds(self): + return [ ('Top Stories', 'http://feeds.reuters.com/reuters/topNews?format=xml'), + ('US News', 'http://feeds.reuters.com/reuters/domesticNews?format=xml'), + ('World News', 'http://feeds.reuters.com/reuters/worldNews?format=xml'), + ('Politics News', 'http://feeds.reuters.com/reuters/politicsNews?format=xml'), + ('Science News', 'http://feeds.reuters.com/reuters/scienceNews?format=xml'), + ('Emviroment News', 'http://feeds.reuters.com/reuters/Environment?format=xml'), + ('Technology News', 'http://feeds.reuters.com/reuters/technologyNews?format=xml'), + ('Oddly Enough News', 'http://feeds.reuters.com/reuters/oddlyEnoughNews?format=xml') + ] + + def print_version(self, url): + return ('http://www.reuters.com/article/id' + url + '?sp=true')