Add profile for The New Yorker

This commit is contained in:
Kovid Goyal 2008-02-13 05:13:00 +00:00
parent a1389cfd57
commit c86e9afe5a
2 changed files with 58 additions and 1 deletions

View File

@ -31,10 +31,11 @@ from libprs500.ebooks.lrf.web.profiles.jpost import JerusalemPost
from libprs500.ebooks.lrf.web.profiles.reuters import Reuters
from libprs500.ebooks.lrf.web.profiles.atlantic import Atlantic
from libprs500.ebooks.lrf.web.profiles.ap import AssociatedPress
from libprs500.ebooks.lrf.web.profiles.newyorker import NewYorker
# Registry of the built-in profile classes; each class is listed exactly once.
builtin_profiles = [
    Atlantic, AssociatedPress, Barrons, BBC,
    ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
    JerusalemPost, Newsweek, NewYorker, NewYorkReviewOfBooks, NYTimes,
    Portfolio, Reuters, SpiegelOnline, WallStreetJournal, ZeitNachrichten,
]

View File

@ -0,0 +1,56 @@
'''
Profile to download The New Yorker
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class NewYorker(DefaultProfile):
    '''
    Download profile for The New Yorker.

    Supplies the feed list, the rewrite from an article URL to its printable
    variant, and the regular expressions used to strip page furniture from
    the fetched HTML. The class attributes below are settings consumed by
    DefaultProfile; their exact semantics are defined there.
    '''

    title = 'The New Yorker'
    max_recursions = 2
    timefmt = ' [%d %b %Y]'  # presumably a strftime-style pattern -- confirm in DefaultProfile
    max_articles_per_feed = 20
    html_description = True
    no_stylesheets = True
    oldest_article = 14

    def print_version(self, url):
        '''
        Return the URL of the printable version of the article at `url`.

        The ``printable=true`` parameter is joined with ``&`` when `url`
        already carries a query string, and with ``?`` otherwise, so that
        the result is always a well-formed query.
        '''
        separator = '&' if '?' in url else '?'
        return url + separator + 'printable=true'

    # (pattern, replacement function) pairs. They cut everything before and
    # after the article content, add an inline ``display:none`` style to the
    # utility, rail-link and keyword <div>s, and empty out the video section.
    preprocess_regexps = [
        (re.compile(r'<body.*?<!-- start article content -->', re.IGNORECASE | re.DOTALL),
         lambda match: '<body>'),
        (re.compile(r'<div class="utils"'),
         lambda match: '<div class="utils" style="display:none"'),
        (re.compile(r'<div class="articleRailLinks"'),
         lambda match: '<div class="articleRailLinks" style="display:none"'),
        (re.compile(r'<div id="keywords"'),
         lambda match: '<div id="keywords" style="display:none"'),
        (re.compile(r'<!-- end article body -->.*?</body>', re.IGNORECASE | re.DOTALL),
         lambda match: '</body>'),
        (re.compile(r'<!-- start video content -->.*?<!-- end video content -->', re.IGNORECASE | re.DOTALL),
         lambda match: '<!-- start video content --><!-- end video content -->'),
    ]

    ## Comment out the feeds you don't want retrieved, or add new RSS feed
    ## URLs here. Per the original author's note, feeds are sorted
    ## alphabetically when converted to LRF; to move one to the top, put a
    ## space in front of its name.
    def get_feeds(self):
        '''Return the list of (feed title, feed URL) pairs to download.'''
        return [
            ('Online Only', 'http://feeds.newyorker.com/services/rss/feeds/online.xml'),
            ('The Talk Of The Town', 'http://feeds.newyorker.com/services/rss/feeds/talk.xml'),
            ('Reporting and Essays', 'http://feeds.newyorker.com/services/rss/feeds/reporting.xml'),
            ('Arts and Culture', 'http://feeds.newyorker.com/services/rss/feeds/arts.xml'),
            ('Humor', 'http://feeds.newyorker.com/services/rss/feeds/humor.xml'),
            ('Fiction and Poetry', 'http://feeds.newyorker.com/services/rss/feeds/fiction.xml'),
            ('Comment', 'http://feeds.newyorker.com/services/rss/feeds/comment.xml'),
            ('The Financial Page', 'http://feeds.newyorker.com/services/rss/feeds/financial.xml'),
            ('Politics', 'http://feeds.newyorker.com/services/rss/feeds/politics.xml'),
            ('Movies', 'http://feeds.newyorker.com/services/rss/feeds/movies.xml'),
            ('Books', 'http://feeds.newyorker.com/services/rss/feeds/books.xml'),
            ('Tables For Two', 'http://feeds.newyorker.com/services/rss/feeds/tables.xml'),
        ]