Add profiles for USA Today and Jutarnji

This commit is contained in:
Kovid Goyal 2008-02-13 05:39:54 +00:00
parent c86e9afe5a
commit f570901c62
3 changed files with 93 additions and 2 deletions

View File

@ -32,11 +32,15 @@ from libprs500.ebooks.lrf.web.profiles.reuters import Reuters
from libprs500.ebooks.lrf.web.profiles.atlantic import Atlantic
from libprs500.ebooks.lrf.web.profiles.ap import AssociatedPress
from libprs500.ebooks.lrf.web.profiles.newyorker import NewYorker
from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji
from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday
# Built-in profile classes. Each class is listed exactly once: the stale
# pre-change rows that repeated JerusalemPost ... ZeitNachrichten have been
# dropped, so available_profiles no longer contains duplicate names.
builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC,
                    ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
                    JerusalemPost, Jutarnji, Newsweek, NewYorker,
                    NewYorkReviewOfBooks, NYTimes, USAToday,
                    Portfolio, Reuters, SpiegelOnline, WallStreetJournal,
                    ZeitNachrichten,
                   ]
# Short name of every built-in profile: the last dotted component of the
# module that defines the class (e.g. '...profiles.usatoday' -> 'usatoday').
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]

View File

@ -0,0 +1,44 @@
'''
Profile to download Jutarnji.hr by Valloric
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Jutarnji(DefaultProfile):
    '''
    Download profile for the Jutarnji.hr news site.

    The class-level settings below override attributes of DefaultProfile;
    their exact semantics are defined by that base class.
    '''

    title = 'Jutarnji'
    max_recursions = 2
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 20
    html_description = True
    no_stylesheets = True

    # (compiled pattern, replacement callable) pairs. Every pattern is
    # case-insensitive and lets '.' match newlines.
    preprocess_regexps = [
        # Collapse everything between the opening <body and the headline span.
        (
            re.compile(r'<body.*?<span class="vijestnaslov">',
                       re.IGNORECASE | re.DOTALL),
            lambda _m: '<body><span class="vijestnaslov">',
        ),
        # Collapse whatever sits between a closing </div> and the next </td>.
        (
            re.compile(r'</div>.*?</td>', re.IGNORECASE | re.DOTALL),
            lambda _m: '</div></td>',
        ),
        # Cut from the "addComment" anchor through the end of the body.
        (
            re.compile(r'<a name="addComment.*?</body>',
                       re.IGNORECASE | re.DOTALL),
            lambda _m: '</body>',
        ),
        # Remove <br> tags entirely.
        (
            re.compile(r'<br>', re.IGNORECASE | re.DOTALL),
            lambda _m: '',
        ),
    ]

    def print_version(self, url):
        '''
        Return the print-version URL for the article at `url`.

        The value passed as ``artid`` is the six characters that end three
        characters before the end of `url` (presumably the article id --
        confirm against the site's URL scheme).
        '''
        stop = len(url) - 3
        start = stop - 6
        article_id = url[start:stop]
        return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + article_id

    def get_feeds(self):
        '''
        Return the RSS feeds to fetch as a list of (name, URL) tuples.

        Comment out the feeds you do not want retrieved, or add new RSS
        feed URLs here. Feeds are sorted alphabetically by name when
        converted to LRF; to keep one at the top, put a space in front of
        its name.
        '''
        feeds = [
            (' Naslovnica', 'http://www.jutarnji.hr/rss'),
            ('Sport', 'http://www.jutarnji.hr/sport/rss'),
            ('Novac', 'http://www.jutarnji.hr/novac/rss'),
            ('Kultura i zivot', 'http://www.jutarnji.hr/kultura_i_zivot/rss'),
            ('Automoto', 'http://www.jutarnji.hr/auto_moto/rss'),
            ('Hi-Tech', 'http://www.jutarnji.hr/kultura_i_zivot/hi-tech/rss'),
            ('Dom i nekretnine', 'http://www.jutarnji.hr/nekretnine/rss'),
        ]
        return feeds

View File

@ -0,0 +1,43 @@
'''
Profile to download USA Today by Valloric
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class USAToday(DefaultProfile):
    '''
    Download profile for USA Today.

    The class-level settings below override attributes of DefaultProfile;
    their exact semantics are defined by that base class.
    '''

    title = 'USA Today'
    max_recursions = 2
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 20
    html_description = True
    # no_stylesheets = True  (left commented out; the attribute is not
    # overridden in this class)

    # (compiled pattern, replacement callable) pairs. Both patterns are
    # case-insensitive and let '.' match newlines.
    preprocess_regexps = [
        # Drop everything from the opening <BODY up to and including the
        # "Article Goes Here" marker comment.
        (
            re.compile(r'<BODY.*?<!--Article Goes Here-->',
                       re.IGNORECASE | re.DOTALL),
            lambda _m: '<BODY>',
        ),
        # Drop everything from the "Article End" marker comment through the
        # closing </BODY>.
        (
            re.compile(r'<!--Article End-->.*?</BODY>',
                       re.IGNORECASE | re.DOTALL),
            lambda _m: '</BODY>',
        ),
    ]

    def print_version(self, url):
        '''
        Return the print-version URL for the article at `url`.

        `url` is appended unchanged as the ``url`` query parameter of the
        clickability "printThis" address.
        '''
        print_service = ('http://www.printthis.clickability.com/pt/printThis'
                         '?clickMap=printThis&fb=Y&url=')
        return print_service + url

    def get_feeds(self):
        '''
        Return the RSS feeds to fetch as a list of (name, URL) tuples.

        Comment out the feeds you do not want retrieved, or add new RSS
        feed URLs here. Feeds are sorted alphabetically by name when
        converted to LRF; to keep one at the top, put a space in front of
        its name.
        '''
        feeds = [
            (' Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
            ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
            ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
            ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
            ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
            ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
            ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
            ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
        ]
        return feeds