Add profiles for Washington Post and United Press International

2025-08-30 23:00:21 -04:00 · 2008-02-14 05:13:03 +00:00 · 2008-02-14 05:13:03 +00:00 · 45d114bc32
commit 45d114bc32
parent 0c61a51a1b
3 changed files with 84 additions and 2 deletions
--- a/src/libprs500/ebooks/lrf/web/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/__init__.py
@@ -34,13 +34,15 @@ from libprs500.ebooks.lrf.web.profiles.ap            import AssociatedPress
 from libprs500.ebooks.lrf.web.profiles.newyorker     import NewYorker 
 from libprs500.ebooks.lrf.web.profiles.jutarnji      import Jutarnji
 from libprs500.ebooks.lrf.web.profiles.usatoday      import USAToday
+from libprs500.ebooks.lrf.web.profiles.upi           import UnitedPressInternational 
+from libprs500.ebooks.lrf.web.profiles.wash_post     import WashingtonPost 

 builtin_profiles   = [Atlantic, AssociatedPress, Barrons, BBC, 
                      ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet, 
                      JerusalemPost, Jutarnji, Newsweek, NewYorker, 
-                      NewYorkReviewOfBooks, NYTimes, USAToday,  
+                      NewYorkReviewOfBooks, NYTimes, UnitedPressInternational, USAToday,  
                      Portfolio, Reuters, SpiegelOnline, WallStreetJournal, 
-                      ZeitNachrichten,   
+                      WashingtonPost, ZeitNachrichten,   
                     ]

 available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
--- a/src/libprs500/ebooks/lrf/web/profiles/upi.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/upi.py
@@ -0,0 +1,36 @@
+import re
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+
+
+class UnitedPressInternational(DefaultProfile):
+    """Feed profile for United Press International (upi.com) NewsTrack RSS feeds."""
+
+    title = 'United Press International'
+    max_recursions = 2
+    max_articles_per_feed = 15
+    html2lrf_options = ['--override-css= "H1 {font-family: Arial; font-weight: bold; color: #000000; size: 10pt;}"']
+
+    # (pattern, replacement) rules; every pattern is compiled with IGNORECASE and
+    # DOTALL, so matching ignores letter case and '.*?' can span line breaks.
+    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
+        (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
+        (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
+        (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
+        (r'<script.*?>.*?</script>', lambda match : ''),
+        (r'<body onload=.*?>.*?<a href="http://www.upi.com">', lambda match : '<body style="font: 8pt arial;">'),
+        (r'<script src="http://www.g.*?>.*?</body>', lambda match : ''),
+        (r'<span style="font: 16pt arial', lambda match : '<span style="font: 12pt arial'),
+    ]]
+
+    def get_feeds(self):
+        """Return the list of (section title, RSS feed URL) pairs for this profile."""
+        return [
+            ('Top Stories', 'http://www.upi.com/rss/NewsTrack/Top_News/'),
+            ('Science', 'http://www.upi.com/rss/NewsTrack/Science/'),
+            ('Health', 'http://www.upi.com/rss/NewsTrack/Health/'),
+            ('Quirks', 'http://www.upi.com/rss/NewsTrack/Quirks/'),
+        ]
+
+    def print_version(self, url):
+        """Return url with 'print_view/' appended (assumes url ends with '/' - TODO confirm)."""
+        return (url + 'print_view/')
--- a/src/libprs500/ebooks/lrf/web/profiles/wash_post.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/wash_post.py
@@ -0,0 +1,44 @@
+import re
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+
+
+class WashingtonPost(DefaultProfile):
+    """Feed profile for the Washington Post (washingtonpost.com) RSS feeds."""
+
+    title = 'Washington Post'
+    max_recursions = 2
+    max_articles_per_feed = 20
+    use_pubdate = False
+
+    # (pattern, replacement) rules; every pattern is compiled with IGNORECASE and
+    # DOTALL, so matching ignores letter case and '.*?' can span line breaks.
+    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
+        (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
+        (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
+        (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
+        (r'<script.*?>.*?</script>', lambda match : ''),
+        (r'<body.*?>.*?.correction {', lambda match : '<body><style>.correction {'),
+        # NOTE(review): this rule consumes text up to and including '</body>' but
+        # substitutes an opening '<body>'; a closing '</body>' looks intended - confirm.
+        (r'<span class="display:none;" name="pubDate".*?>.*?</body>', lambda match : '<body>'),
+    ]]
+
+    def get_feeds(self):
+        """Return the list of (section title, RSS feed URL) pairs for this profile."""
+        return [
+            ('Today\'s Highlights', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/03/24/LI2005032400102.xml'),
+            ('Politics', 'http://www.washingtonpost.com/wp-dyn/rss/politics/index.xml'),
+            ('Nation', 'http://www.washingtonpost.com/wp-dyn/rss/nation/index.xml'),
+            ('World', 'http://www.washingtonpost.com/wp-dyn/rss/world/index.xml'),
+            ('Business', 'http://www.washingtonpost.com/wp-dyn/rss/business/index.xml'),
+            ('Technology', 'http://www.washingtonpost.com/wp-dyn/rss/technology/index.xml'),
+            ('Health', 'http://www.washingtonpost.com/wp-dyn/rss/health/index.xml'),
+            ('Education', 'http://www.washingtonpost.com/wp-dyn/rss/education/index.xml'),
+            ('Editorials', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/05/30/LI2005053000331.xml'),
+        ]
+
+    def print_version(self, url):
+        """Return the text of url before its last '.' with '_pf.html' appended."""
+        # e.g. '.../AR2008021300123.html' -> '.../AR2008021300123_pf.html'
+        return (url.rpartition('.')[0] + '_pf.html')
+