Add profiles for The Atlantic, The Christian Science Monitor, The Jerusalem Post and Reuters

This commit is contained in:
Kovid Goyal 2008-01-31 01:50:24 +00:00
parent f7fe2201b8
commit 536a4eaf00
5 changed files with 180 additions and 3 deletions

View File

@ -25,10 +25,15 @@ from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal
from libprs500.ebooks.lrf.web.profiles.barrons import Barrons from libprs500.ebooks.lrf.web.profiles.barrons import Barrons
from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio
from libprs500.ebooks.lrf.web.profiles.dilbert import Dilbert from libprs500.ebooks.lrf.web.profiles.dilbert import Dilbert
from libprs500.ebooks.lrf.web.profiles.cnn import CNN from libprs500.ebooks.lrf.web.profiles.cnn import CNN
from libprs500.ebooks.lrf.web.profiles.chr_mon import ChristianScienceMonitor
from libprs500.ebooks.lrf.web.profiles.jpost import JerusalemPost
from libprs500.ebooks.lrf.web.profiles.reuters import Reuters
from libprs500.ebooks.lrf.web.profiles.atlantic import Atlantic
builtin_profiles = [Barrons, BBC, CNN, Dilbert, Economist, FazNet, Newsweek, NewYorkReviewOfBooks, NYTimes, \ builtin_profiles = [Atlantic, Barrons, BBC, ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet,
Portfolio, SpiegelOnline, WallStreetJournal, ZeitNachrichten, \ JerusalemPost, Newsweek, NewYorkReviewOfBooks, NYTimes,
Portfolio, Reuters, SpiegelOnline, WallStreetJournal, ZeitNachrichten,
] ]
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]

View File

@ -0,0 +1,59 @@
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Atlantic(DefaultProfile):
    """Profile for the current issue of The Atlantic.

    The article list is scraped from the issue index page (``INDEX``)
    by ``parse_feeds`` instead of being read from RSS feeds.
    """

    title = 'The Atlantic'
    max_recursions = 2
    # Index page listing the articles of the current issue.
    INDEX = 'http://www.theatlantic.com/doc/current'

    # (compiled pattern, replacement callable) pairs. This one collapses
    # everything from the opening <body tag up to the "storytop" div.
    preprocess_regexps = [
        (
            re.compile(r'<body.*?<div id="storytop"', re.DOTALL|re.IGNORECASE),
            lambda m: '<body><div id="storytop"',
        ),
    ]

    def parse_feeds(self):
        """Scrape the issue index page and build the article list.

        Side effect: when a ``<span class="issue">`` is found,
        ``self.timefmt`` is set to the text after its last ``|``,
        stripped, with ``/`` replaced by ``-`` and wrapped in `` [...]``.

        Returns a dict with the single key 'Current Issue' mapped to a
        list of article dicts ('title', 'date', 'url', 'description').
        """
        page = self.browser.open(self.INDEX).read()
        soup = BeautifulSoup(page)

        issue = soup.find('span', attrs={'class':'issue'})
        if issue:
            issue_text = self.tag_to_string(issue)
            issue_label = issue_text.rpartition('|')[-1].strip()
            self.timefmt = ' [%s]'%issue_label.replace('/', '-')

        articles = []
        for item in soup.findAll('div', attrs={'class':'item'}):
            link = item.find('a')
            # Skip items without a usable link.
            if not link or not link.has_key('href'):
                continue

            # Rewrite the article path so it points at the print view.
            href = link['href']
            print_url = 'http://www.theatlantic.com/'+href.replace('/doc', 'doc/print')
            headline = self.tag_to_string(link)
            byline = item.find(attrs={'class':'byline'})

            articles.append({
                'title': headline,
                # The byline text is what gets stored in the 'date' field.
                'date': self.tag_to_string(byline) if byline else '',
                'url': print_url,
                'description': '',
            })

        return {'Current Issue' : articles }

View File

@ -0,0 +1,38 @@
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class ChristianScienceMonitor(DefaultProfile):
    """Profile for The Christian Science Monitor, driven by its RSS feeds."""

    title = 'Christian Science Monitor'
    max_recursions = 2
    max_articles_per_feed = 20
    use_pubdate = False
    html_description = True
    html2lrf_options = ['--ignore-tables', '--base-font-size=8.0', '--wordspace=2.0',]

    # (compiled pattern, replacement callable) pairs. Every pattern is
    # compiled case-insensitively and with DOTALL so ".*?" spans newlines.
    preprocess_regexps = [
        (re.compile(_pattern, re.IGNORECASE | re.DOTALL), _replacement)
        for _pattern, _replacement in [
            (r'<HEAD>.*?</HEAD>', lambda match: '<HEAD></HEAD>'),
            # The parentheses in setup(null) are escaped: left bare they
            # form a regex group and the literal attribute never matches.
            (r'<body class="apple-rss-no-unread-mode" onLoad="setup\(null\)">.*?<!-- start Entries -->',
             lambda match: '<BODY><!-- start Entries -->'),
            (r'<!-- end Entries -->.*?</BODY>', lambda match: '<!-- end Entries --></BODY>'),
            (r'<script>.*?</script>', lambda match: ''),
            (r'<body>.*?<div class="portlet-container">',
             lambda match: '<body><div class="portlet-container">'),
            (r'<div class="pubdate">.*?</div>', lambda match: ''),
            (r'<div class="factbox">.*?</body>', lambda match: '</body>'),
        ]
    ]

    def get_feeds(self):
        """Return the list of ``(section title, feed URL)`` pairs."""
        return [
            ('Top News', 'http://rss.csmonitor.com/feeds/top'),
            ('Terrorism', 'http://rss.csmonitor.com/terrorismSecurity'),
            ('World', 'http://rss.csmonitor.com/feeds/world'),
        ]

    def print_version(self, url):
        """Return the URL to download for the article at ``url``.

        The link is opened so that any redirect is followed, then the
        resolved URL is stripped of surrounding whitespace and its final
        character is dropped.
        """
        resolved_url = self.browser.open(url).geturl()
        # NOTE(review): presumably the print page lives at the same URL
        # minus its last character (e.g. ".html" -> ".htm") - confirm.
        return resolved_url.strip()[:-1]

View File

@ -0,0 +1,36 @@
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class JerusalemPost(DefaultProfile):
    """Profile for The Jerusalem Post, driven by its RSS feeds."""

    title = 'Jerusalem Post'
    max_recursions = 2
    max_articles_per_feed = 10

    # (compiled pattern, replacement callable) pairs. Every pattern is
    # compiled case-insensitively and with DOTALL so ".*?" spans newlines.
    preprocess_regexps = [
        (re.compile(_pattern, re.IGNORECASE | re.DOTALL), _replacement)
        for _pattern, _replacement in [
            (r'<HEAD>.*?</HEAD>', lambda match: '<HEAD></HEAD>'),
            (r'<BODY.*?>.*?<!-- start Entries -->',
             lambda match: '<BODY><!-- start Entries -->'),
            (r'<!-- end Entries -->.*?</BODY>', lambda match: '</BODY>'),
            (r'<script.*?>.*?</script>', lambda match: ''),
            (r'<div class="apple-rss-article apple-rss-read" onclick=.*?<div class="apple-rss-article-body">',
             lambda match: ''),
            (r'<img src=\'/images/logo_NWAnews.gif\' alt=\'NWAnews.com :: Northwest Arkansas\' News Source\'.*?>',
             lambda match: ''),
            (r'<img src=\'/images/logo_adg.gif\'.*?>', lambda match: ''),
            (r'<P CLASS="smallprint">.*?</body>', lambda match: '</body>'),
        ]
    ]

    def get_feeds(self):
        """Return the list of ``(section title, feed URL)`` pairs."""
        feed_base = 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid='
        sections = [
            ('Front Page', '1123495333346'),
            ('Israel News', '1178443463156'),
            ('Middle East News', '1123495333498'),
            ('International News', '1178443463144'),
            ('Editorials', '1123495333211'),
        ]
        return [(name, feed_base + cid) for name, cid in sections]

    def print_version(self, url):
        """Build the printer-friendly URL for the article at ``url``.

        Whatever follows the last ``&`` in ``url`` (the whole string when
        there is no ``&``) is used as the value of the ``cid`` parameter.
        """
        # NOTE(review): if that trailing piece already starts with "cid="
        # the result contains "cid=cid=..." - confirm the feed link shape.
        trailing_param = url.rpartition('&')[2]
        prefix = 'http://www.jpost.com/servlet/Satellite?cid='
        suffix = '&pagename=JPost%2FJPArticle%2FPrinter'
        return prefix + trailing_param + suffix

View File

@ -0,0 +1,39 @@
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class Reuters(DefaultProfile):
    """Profile for Reuters news, driven by its RSS feeds."""

    title = 'Reuters'
    max_recursions = 2
    max_articles_per_feed = 10
    html_description = True

    # (compiled pattern, replacement callable) pairs. Every pattern is
    # compiled case-insensitively and with DOTALL so ".*?" spans newlines.
    preprocess_regexps = [
        (re.compile(_pattern, re.IGNORECASE | re.DOTALL), _replacement)
        for _pattern, _replacement in [
            # Disabled in the original profile; kept for reference:
            # (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
            (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->',
             lambda match: ''),
            (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match: '</body>'),
            (r'<script.*?>.*?</script>', lambda match: ''),
            (r'<body>.*?<div class="contentBand">', lambda match: '<body>'),
            (r'<h3>Share:</h3>.*?</body>',
             lambda match: '<!-- END:: Shared Module id=36615 --></body>'),
            (r'<div id="atools" class="articleTools">.*?<div class="linebreak">',
             lambda match: '<div class="linebreak">'),
        ]
    ]

    def get_feeds(self):
        """Return the list of ``(section title, feed URL)`` pairs."""
        return [
            ('Top Stories', 'http://feeds.reuters.com/reuters/topNews?format=xml'),
            ('US News', 'http://feeds.reuters.com/reuters/domesticNews?format=xml'),
            ('World News', 'http://feeds.reuters.com/reuters/worldNews?format=xml'),
            ('Politics News', 'http://feeds.reuters.com/reuters/politicsNews?format=xml'),
            ('Science News', 'http://feeds.reuters.com/reuters/scienceNews?format=xml'),
            # Section title spelling corrected (was 'Emviroment News').
            ('Environment News', 'http://feeds.reuters.com/reuters/Environment?format=xml'),
            ('Technology News', 'http://feeds.reuters.com/reuters/technologyNews?format=xml'),
            ('Oddly Enough News', 'http://feeds.reuters.com/reuters/oddlyEnoughNews?format=xml'),
        ]

    def print_version(self, url):
        """Build the single-page article URL for ``url``.

        ``url`` is inserted verbatim between the article prefix and the
        ``?sp=true`` query string.
        """
        # NOTE(review): this only yields a valid address if ``url`` is an
        # article identifier rather than a full link - confirm with caller.
        return ('http://www.reuters.com/article/id' + url + '?sp=true')