Add profiles for Washington Post and United Press International

This commit is contained in:
Kovid Goyal 2008-02-14 05:13:03 +00:00
parent 0c61a51a1b
commit 45d114bc32
3 changed files with 84 additions and 2 deletions

View File

@ -34,13 +34,15 @@ from libprs500.ebooks.lrf.web.profiles.ap import AssociatedPress
from libprs500.ebooks.lrf.web.profiles.newyorker import NewYorker from libprs500.ebooks.lrf.web.profiles.newyorker import NewYorker
from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji
from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday
from libprs500.ebooks.lrf.web.profiles.upi import UnitedPressInternational
from libprs500.ebooks.lrf.web.profiles.wash_post import WashingtonPost
# Registry list of the built-in profile classes.
# NOTE(review): this hunk is a side-by-side diff rendering, so each line below
# carries the old text followed by the new text; the new side adds
# UnitedPressInternational and WashingtonPost to the list.
builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC, builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC,
ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet, ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
JerusalemPost, Jutarnji, Newsweek, NewYorker, JerusalemPost, Jutarnji, Newsweek, NewYorker,
NewYorkReviewOfBooks, NYTimes, USAToday, NewYorkReviewOfBooks, NYTimes, UnitedPressInternational, USAToday,
Portfolio, Reuters, SpiegelOnline, WallStreetJournal, Portfolio, Reuters, SpiegelOnline, WallStreetJournal,
ZeitNachrichten, WashingtonPost, ZeitNachrichten,
] ]
# Each profile is named by the last dotted component of its defining module.
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]

View File

@ -0,0 +1,36 @@
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class UnitedPressInternational(DefaultProfile):
    """Profile describing how to fetch United Press International articles.

    Declares the feed list, the printer-friendly URL scheme and a set of
    regular-expression clean-ups for the fetched HTML.
    """

    title = 'United Press International'
    max_recursions = 2
    max_articles_per_feed = 15
    # Extra option string; presumably forwarded to html2lrf by the base
    # profile -- confirm against DefaultProfile.
    html2lrf_options = ['--override-css= "H1 {font-family: Arial; font-weight: bold; color: #000000; size: 10pt;}"']

    # Pairs of (compiled pattern, replacement callable). Every pattern is
    # compiled case-insensitively and with DOTALL so that '.*?' can span
    # line breaks inside the page markup.
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            # Empty the document head.
            (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
            # Drop everything from the sidebar background div up to the
            # start-of-entries marker.
            (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
            # Drop everything after the content-area end marker, keeping
            # the closing body tag.
            (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
            # Remove script elements.
            (r'<script.*?>.*?</script>', lambda match : ''),
            # Replace the body opening through the first upi.com home link
            # with a plain body tag carrying a smaller default font.
            (r'<body onload=.*?>.*?<a href="http://www.upi.com">', lambda match : '<body style="font: 8pt arial;">'),
            # Remove everything from this script tag to the end of the body.
            (r'<script src="http://www.g.*?>.*?</body>', lambda match : ''),
            # Shrink 16pt spans to 12pt.
            (r'<span style="font: 16pt arial', lambda match : '<span style="font: 12pt arial'),
        ]
    ]

    def get_feeds(self):
        """Return the list of (section title, RSS feed URL) pairs to download."""
        return [ ('Top Stories', 'http://www.upi.com/rss/NewsTrack/Top_News/'),
                 ('Science', 'http://www.upi.com/rss/NewsTrack/Science/'),
                 ('Health', 'http://www.upi.com/rss/NewsTrack/Health/'),
                 ('Quirks', 'http://www.upi.com/rss/NewsTrack/Quirks/'),
               ]

    def print_version(self, url):
        """Return the printer-friendly URL for the article at ``url``.

        The print view lives under a ``print_view/`` path segment. A
        separating slash is added when ``url`` does not already end with
        one, so the segment is never fused onto the last path component.
        """
        if not url.endswith('/'):
            url += '/'
        return url + 'print_view/'

View File

@ -0,0 +1,44 @@
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class WashingtonPost(DefaultProfile):
    """Profile describing how to fetch Washington Post articles.

    Declares the feed list, the printer-friendly URL scheme and a set of
    regular-expression clean-ups for the fetched HTML.
    """

    title = 'Washington Post'
    max_recursions = 2
    max_articles_per_feed = 20
    # NOTE(review): presumably tells the base profile not to rely on feed
    # publication dates -- confirm against DefaultProfile.
    use_pubdate = False

    # Pairs of (compiled pattern, replacement callable). Every pattern is
    # compiled case-insensitively and with DOTALL so that '.*?' can span
    # line breaks inside the page markup.
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            # Empty the document head.
            (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
            # Drop everything from the sidebar background div up to the
            # start-of-entries marker.
            (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
            # Drop everything after the content-area end marker, keeping
            # the closing body tag.
            (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
            # Remove script elements.
            (r'<script.*?>.*?</script>', lambda match : ''),
            # Collapse the body opening up to the '.correction {' style rule
            # into a bare body tag followed by a style element.
            (r'<body.*?>.*?.correction {', lambda match : '<body><style>.correction {'),
            # NOTE(review): the matched text ends at '</body>' but is replaced
            # with an opening '<body>' tag; kept as-is, confirm whether
            # '</body>' was intended.
            (r'<span class="display:none;" name="pubDate".*?>.*?</body>', lambda match : '<body>'),
        ]
    ]

    def get_feeds(self):
        """Return the list of (section title, RSS feed URL) pairs to download."""
        return [ ('Today\'s Highlights', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/03/24/LI2005032400102.xml'),
                 ('Politics', 'http://www.washingtonpost.com/wp-dyn/rss/politics/index.xml'),
                 # Host corrected: the original had a duplicated 'www.' label.
                 ('Nation', 'http://www.washingtonpost.com/wp-dyn/rss/nation/index.xml'),
                 ('World', 'http://www.washingtonpost.com/wp-dyn/rss/world/index.xml'),
                 ('Business', 'http://www.washingtonpost.com/wp-dyn/rss/business/index.xml'),
                 ('Technology', 'http://www.washingtonpost.com/wp-dyn/rss/technology/index.xml'),
                 ('Health', 'http://www.washingtonpost.com/wp-dyn/rss/health/index.xml'),
                 ('Education', 'http://www.washingtonpost.com/wp-dyn/rss/education/index.xml'),
                 ('Editorials', 'http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/05/30/LI2005053000331.xml'),
               ]

    def print_version(self, url):
        """Return the printer-friendly URL for the article at ``url``.

        Everything from the last '.' in ``url`` onwards is dropped and
        replaced with '_pf.html'.
        """
        return (url.rpartition('.')[0] + '_pf.html')