Add profiles for USA Today and Jutarnji

This commit is contained in:
Kovid Goyal 2008-02-13 05:39:54 +00:00
parent c86e9afe5a
commit f570901c62
3 changed files with 93 additions and 2 deletions

View File

@ -32,11 +32,15 @@ from libprs500.ebooks.lrf.web.profiles.reuters import Reuters
from libprs500.ebooks.lrf.web.profiles.atlantic import Atlantic
from libprs500.ebooks.lrf.web.profiles.ap import AssociatedPress
from libprs500.ebooks.lrf.web.profiles.newyorker import NewYorker
from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji
from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday

# Registry of the profile classes that ship with the package.
# NOTE(review): the remaining names (Barrons, BBC, Reuters, ...) are imported
# above this span, outside the visible hunk.
builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC,
                    ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
                    JerusalemPost, Jutarnji, Newsweek, NewYorker,
                    NewYorkReviewOfBooks, NYTimes, USAToday,
                    Portfolio, Reuters, SpiegelOnline, WallStreetJournal,
                    ZeitNachrichten,
                    ]

# Short name of each profile: the last dotted component of the module that
# defines the class (e.g. '...profiles.usatoday' -> 'usatoday').
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]

View File

@ -0,0 +1,44 @@
'''
Profile to download Jutarnji.hr by Valloric
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Jutarnji(DefaultProfile):
    '''
    Feed profile for the Jutarnji.hr news site.

    Declares the download settings, the HTML clean-up patterns, the mapping
    from an article URL to its print page, and the list of RSS feeds.
    '''

    title = 'Jutarnji'
    max_recursions = 2
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 20
    html_description = True
    no_stylesheets = True

    # Each entry pairs a compiled pattern with a callable that returns the
    # replacement text for a match. Order is significant and kept as written.
    # NOTE(review): presumably applied by the base profile to the downloaded
    # HTML -- confirm against DefaultProfile.
    preprocess_regexps = [
        # Collapse everything from <body> up to the 'vijestnaslov' span.
        (
            re.compile(r'<body.*?<span class="vijestnaslov">',
                       re.IGNORECASE | re.DOTALL),
            lambda match: '<body><span class="vijestnaslov">',
        ),
        # Collapse whatever sits between a closing </div> and the next </td>.
        (
            re.compile(r'</div>.*?</td>', re.IGNORECASE | re.DOTALL),
            lambda match: '</div></td>',
        ),
        # Cut from the "addComment" anchor through the end of the body.
        (
            re.compile(r'<a name="addComment.*?</body>',
                       re.IGNORECASE | re.DOTALL),
            lambda match: '</body>',
        ),
        # Remove every <br> tag.
        (
            re.compile(r'<br>', re.IGNORECASE | re.DOTALL),
            lambda match: '',
        ),
    ]

    def print_version(self, url):
        '''
        Return the print-page URL for the article at `url`.

        The article id is read from a fixed position in `url`: the six
        characters that end three characters before the end of the string.
        '''
        id_start = len(url) - 9
        id_stop = len(url) - 3
        article_id = url[id_start:id_stop]
        return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + article_id

    def get_feeds(self):
        '''
        Return the feeds to download as a list of (title, RSS URL) pairs.

        Comment out the feeds you don't want retrieved, or add any new RSS
        feed URLs here. Feeds are sorted alphabetically when converted to
        LRF; to force one to the top, prepend a space to its title.
        '''
        feeds = [
            (' Naslovnica', 'http://www.jutarnji.hr/rss'),
            ('Sport', 'http://www.jutarnji.hr/sport/rss'),
            ('Novac', 'http://www.jutarnji.hr/novac/rss'),
            ('Kultura i zivot', 'http://www.jutarnji.hr/kultura_i_zivot/rss'),
            ('Automoto', 'http://www.jutarnji.hr/auto_moto/rss'),
            ('Hi-Tech', 'http://www.jutarnji.hr/kultura_i_zivot/hi-tech/rss'),
            ('Dom i nekretnine', 'http://www.jutarnji.hr/nekretnine/rss'),
        ]
        return feeds

View File

@ -0,0 +1,43 @@
'''
Profile to download USA Today by Valloric
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class USAToday(DefaultProfile):
    '''
    Feed profile for USA Today.

    Declares the download settings, the HTML clean-up patterns, the mapping
    from an article URL to its print page, and the list of RSS feeds.
    '''

    title = 'USA Today'
    max_recursions = 2
    timefmt = ' [%d %b %Y]'
    max_articles_per_feed = 20
    html_description = True
    # no_stylesheets = True   (left disabled for this profile)

    # Each entry pairs a compiled pattern with a callable that returns the
    # replacement text for a match. Order is significant and kept as written.
    # NOTE(review): presumably applied by the base profile to the downloaded
    # HTML -- confirm against DefaultProfile.
    preprocess_regexps = [
        # Drop everything from <BODY> through the "Article Goes Here" marker.
        (
            re.compile(r'<BODY.*?<!--Article Goes Here-->',
                       re.IGNORECASE | re.DOTALL),
            lambda match: '<BODY>',
        ),
        # Drop everything from the "Article End" marker through </BODY>.
        (
            re.compile(r'<!--Article End-->.*?</BODY>',
                       re.IGNORECASE | re.DOTALL),
            lambda match: '</BODY>',
        ),
    ]

    def print_version(self, url):
        '''
        Return the print-page URL for the article at `url`.

        The article URL is appended unchanged as the `url` query parameter
        of the clickability "printThis" address.
        '''
        print_service = ('http://www.printthis.clickability.com/pt/printThis'
                         '?clickMap=printThis&fb=Y&url=')
        return print_service + url

    def get_feeds(self):
        '''
        Return the feeds to download as a list of (title, RSS URL) pairs.

        Comment out the feeds you don't want retrieved, or add any new RSS
        feed URLs here. Feeds are sorted alphabetically when converted to
        LRF; to force one to the top, prepend a space to its title.
        '''
        feeds = [
            (' Top Headlines', 'http://rssfeeds.usatoday.com/usatoday-NewsTopStories'),
            ('Sport Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomSports-TopStories'),
            ('Tech Headlines', 'http://rssfeeds.usatoday.com/usatoday-TechTopStories'),
            ('Travel Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomTravel-TopStories'),
            ('Money Headlines', 'http://rssfeeds.usatoday.com/UsatodaycomMoney-TopStories'),
            ('Entertainment Headlines', 'http://rssfeeds.usatoday.com/usatoday-LifeTopStories'),
            ('Weather Headlines', 'http://rssfeeds.usatoday.com/usatoday-WeatherTopStories'),
            ('Most Popular', 'http://rssfeeds.usatoday.com/Usatoday-MostViewedArticles'),
        ]
        return feeds