Add profiles for The Atlantic, The Christian Science Monitor, The Jerusalem Post and Reuters

2025-07-09 03:04:10 -04:00 · 2008-01-31 01:50:24 +00:00 · 2008-01-31 01:50:24 +00:00 · 536a4eaf00
commit 536a4eaf00
parent f7fe2201b8
5 changed files with 180 additions and 3 deletions
--- a/src/libprs500/ebooks/lrf/web/init.py
+++ b/src/libprs500/ebooks/lrf/web/init.py
@ -25,10 +25,15 @@ from libprs500.ebooks.lrf.web.profiles.wsj           import WallStreetJournal
 from libprs500.ebooks.lrf.web.profiles.barrons       import Barrons
 from libprs500.ebooks.lrf.web.profiles.portfolio     import Portfolio
 from libprs500.ebooks.lrf.web.profiles.dilbert       import Dilbert  
-from libprs500.ebooks.lrf.web.profiles.cnn           import CNN 
+from libprs500.ebooks.lrf.web.profiles.cnn           import CNN
+from libprs500.ebooks.lrf.web.profiles.chr_mon       import ChristianScienceMonitor
+from libprs500.ebooks.lrf.web.profiles.jpost         import JerusalemPost
+from libprs500.ebooks.lrf.web.profiles.reuters       import Reuters
+from libprs500.ebooks.lrf.web.profiles.atlantic      import Atlantic 

-builtin_profiles   = [Barrons, BBC, CNN, Dilbert, Economist, FazNet, Newsweek, NewYorkReviewOfBooks, NYTimes,  \
-                      Portfolio, SpiegelOnline, WallStreetJournal, ZeitNachrichten,   \
+builtin_profiles   = [Atlantic, Barrons, BBC, ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet, 
+                      JerusalemPost, Newsweek, NewYorkReviewOfBooks, NYTimes,  
+                      Portfolio, Reuters, SpiegelOnline, WallStreetJournal, ZeitNachrichten,   
                     ]

 available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
--- a/src/libprs500/ebooks/lrf/web/profiles/atlantic.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/atlantic.py
@ -0,0 +1,59 @@
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import re
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class Atlantic(DefaultProfile):
+    '''
+    Profile for The Atlantic.
+
+    Instead of RSS feeds, the article list is scraped from the page at
+    INDEX and returned as a single 'Current Issue' feed by parse_feeds().
+    '''
+
+    title = 'The Atlantic'
+    max_recursions = 2
+    # Page that parse_feeds() downloads and scans for article links.
+    INDEX = 'http://www.theatlantic.com/doc/current'
+
+    # (compiled pattern, replacement callable) pairs. The single rule here
+    # collapses everything from the opening <body up to the "storytop" div
+    # into a bare '<body><div id="storytop"'.
+    preprocess_regexps = [
+                          (re.compile(r'<body.*?<div id="storytop"', re.DOTALL|re.IGNORECASE),
+                           lambda m: '<body><div id="storytop"')
+                          ]
+
+    def parse_feeds(self):
+        '''
+        Download INDEX and build the article list from it.
+
+        Side effect: when the page contains a <span class="issue"> element,
+        self.timefmt is set to ' [<text>]', where <text> is whatever follows
+        the last '|' in that element, stripped, with '/' replaced by '-'.
+
+        Returns a dict {'Current Issue': articles}; every article is a dict
+        with the keys 'title', 'date', 'url' and 'description'.
+        '''
+        articles = []
+
+        src = self.browser.open(self.INDEX).read()
+        soup = BeautifulSoup(src)
+
+        issue = soup.find('span', attrs={'class':'issue'})
+        if issue:
+            issue_text = self.tag_to_string(issue).rpartition('|')[-1].strip()
+            self.timefmt = ' [%s]'%issue_text.replace('/', '-')
+
+        for item in soup.findAll('div', attrs={'class':'item'}):
+            a = item.find('a')
+            if not (a and a.has_key('href')):
+                # No usable link in this item, nothing to add.
+                continue
+            # Turn the first '/doc' of the href into 'doc/print'. Only the
+            # first occurrence is rewritten so that a later '/doc...' inside
+            # the path (e.g. an article slug starting with "doc") is left
+            # intact.
+            url = 'http://www.theatlantic.com/'+a['href'].replace('/doc', 'doc/print', 1)
+            byline = item.find(attrs={'class':'byline'})
+            articles.append({
+                             'title':self.tag_to_string(a),
+                             # The byline text (if any) is what ends up under 'date'.
+                             'date':self.tag_to_string(byline) if byline else '',
+                             'url':url,
+                             'description':''
+                            })
+
+        return {'Current Issue' : articles }
+        
+        
--- a/src/libprs500/ebooks/lrf/web/profiles/chr_mon.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/chr_mon.py
@ -0,0 +1,38 @@
+import re
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+
+class ChristianScienceMonitor(DefaultProfile):
+    '''
+    Profile for the Christian Science Monitor, driven by the RSS feeds
+    listed in get_feeds().
+    '''
+
+    title = 'Christian Science Monitor'
+    max_recursions = 2
+    max_articles_per_feed = 20
+    use_pubdate = False
+    html_description = True
+    html2lrf_options = ['--ignore-tables', '--base-font-size=8.0', '--wordspace=2.0',]
+
+    # (compiled pattern, replacement callable) pairs. Every pattern is
+    # compiled case-insensitively and with DOTALL so '.' also spans newlines.
+    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+[
+        (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
+        # The parentheses of setup(null) are escaped: unescaped they form a
+        # regex group and the pattern could never match the literal attribute.
+        (r'<body class="apple-rss-no-unread-mode" onLoad="setup\(null\)">.*?<!-- start Entries -->', lambda match : '<BODY><!-- start Entries -->'),
+        (r'<!-- end Entries -->.*?</BODY>', lambda match : '<!-- end Entries --></BODY>'),
+        (r'<script>.*?</script>', lambda match : ''),
+        (r'<body>.*?<div class="portlet-container">', lambda match : '<body><div class="portlet-container">'),
+        (r'<div class="pubdate">.*?</div>', lambda match : ''),
+        (r'<div class="factbox">.*?</body>', lambda match : '</body>'),
+
+    ]
+    ]
+
+    def get_feeds(self):
+        '''Return the feeds to fetch as a list of (title, RSS URL) tuples.'''
+        return [ ('Top News', 'http://rss.csmonitor.com/feeds/top'),
+                  ('Terrorism', 'http://rss.csmonitor.com/terrorismSecurity'),
+                  ('World', 'http://rss.csmonitor.com/feeds/world'),
+               ]
+
+    def print_version(self, url):
+        '''
+        Open `url`, take the URL the browser ended up at (after any
+        redirects), strip surrounding whitespace and drop its last character.
+
+        NOTE(review): dropping the last character presumably turns a
+        '.html' address into the '.htm' print page -- confirm.
+        '''
+        resolved_url = self.browser.open(url).geturl()
+        return resolved_url.strip()[:-1]
--- a/src/libprs500/ebooks/lrf/web/profiles/jpost.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/jpost.py
@ -0,0 +1,36 @@
+import re
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+
+class JerusalemPost(DefaultProfile):
+    '''
+    Profile for the Jerusalem Post, driven by the RSS feeds listed in
+    get_feeds().
+    '''
+
+    title = 'Jerusalem Post'
+    max_recursions = 2
+    max_articles_per_feed = 10
+
+    # (compiled pattern, replacement callable) pairs; all patterns are
+    # compiled with IGNORECASE and DOTALL.
+    # NOTE(review): the two <img ...logo_NWAnews/logo_adg...> patterns look
+    # like leftovers from a different site's profile -- confirm they are needed.
+    preprocess_regexps = [
+        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
+        for pattern, replacement in (
+            (r'<HEAD>.*?</HEAD>', lambda match: '<HEAD></HEAD>'),
+            (r'<BODY.*?>.*?<!-- start Entries -->', lambda match: '<BODY><!-- start Entries -->'),
+            (r'<!-- end Entries -->.*?</BODY>', lambda match: '</BODY>'),
+            (r'<script.*?>.*?</script>', lambda match: ''),
+            (r'<div class="apple-rss-article apple-rss-read" onclick=.*?<div class="apple-rss-article-body">', lambda match: ''),
+            (r'<img src=\'/images/logo_NWAnews.gif\' alt=\'NWAnews.com :: Northwest Arkansas\' News Source\'.*?>', lambda match: ''),
+            (r'<img src=\'/images/logo_adg.gif\'.*?>', lambda match: ''),
+            (r'<P CLASS="smallprint">.*?</body>', lambda match: '</body>'),
+        )
+    ]
+
+    def get_feeds(self):
+        '''Return the feeds to fetch as a list of (title, RSS URL) tuples.'''
+        feeds = [
+            ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'),
+            ('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'),
+            ('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'),
+            ('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'),
+            ('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'),
+        ]
+        return feeds
+
+    def print_version(self, url):
+        '''
+        Build the printer-page URL from whatever follows the last '&' in
+        `url` (the whole of `url` when it contains no '&').
+
+        NOTE(review): if that trailing piece already starts with 'cid=' the
+        result contains 'cid=cid=...' -- verify against real feed links.
+        '''
+        last_param = url.rpartition('&')[2]
+        prefix = 'http://www.jpost.com/servlet/Satellite?cid='
+        suffix = '&pagename=JPost%2FJPArticle%2FPrinter'
+        return prefix + last_param + suffix
+         
--- a/src/libprs500/ebooks/lrf/web/profiles/reuters.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/reuters.py
@ -0,0 +1,39 @@
+import re
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+
+
+class Reuters(DefaultProfile):
+    '''
+    Profile for Reuters, driven by the RSS feeds listed in get_feeds().
+    '''
+
+    title = 'Reuters'
+    max_recursions = 2
+    max_articles_per_feed = 10
+    html_description = True
+
+    # (compiled pattern, replacement callable) pairs; all patterns are
+    # compiled with IGNORECASE and DOTALL.
+    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+[
+        ##(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
+        (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
+        (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
+        (r'<script.*?>.*?</script>', lambda match : ''),
+        (r'<body>.*?<div class="contentBand">', lambda match : '<body>'),
+        (r'<h3>Share:</h3>.*?</body>', lambda match : '<!-- END:: Shared Module id=36615 --></body>'),
+        (r'<div id="atools" class="articleTools">.*?<div class="linebreak">', lambda match : '<div class="linebreak">'),
+    ]
+    ]
+
+    def get_feeds(self):
+        '''Return the feeds to fetch as a list of (title, RSS URL) tuples.'''
+        return [ ('Top Stories', 'http://feeds.reuters.com/reuters/topNews?format=xml'),
+                  ('US News', 'http://feeds.reuters.com/reuters/domesticNews?format=xml'),
+                  ('World News', 'http://feeds.reuters.com/reuters/worldNews?format=xml'),
+                  ('Politics News', 'http://feeds.reuters.com/reuters/politicsNews?format=xml'),
+                  ('Science News', 'http://feeds.reuters.com/reuters/scienceNews?format=xml'),
+                  # Feed title spelling corrected (was 'Emviroment News').
+                  ('Environment News', 'http://feeds.reuters.com/reuters/Environment?format=xml'),
+                  ('Technology News', 'http://feeds.reuters.com/reuters/technologyNews?format=xml'),
+                  ('Oddly Enough News', 'http://feeds.reuters.com/reuters/oddlyEnoughNews?format=xml')
+         ]
+
+    def print_version(self, url):
+        '''
+        Return 'http://www.reuters.com/article/id' + url + '?sp=true'.
+
+        NOTE(review): `url` is appended unchanged, so this only yields a
+        sensible address if the caller passes an article id rather than a
+        full URL -- verify against what DefaultProfile hands in.
+        '''
+        return ('http://www.reuters.com/article/id' + url + '?sp=true')