mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add profiles for The Atlantic, The Christian Science Monitor, The Jerusalem Post and Reuters
This commit is contained in:
parent
f7fe2201b8
commit
536a4eaf00
@ -25,10 +25,15 @@ from libprs500.ebooks.lrf.web.profiles.wsj import WallStreetJournal
|
||||
from libprs500.ebooks.lrf.web.profiles.barrons import Barrons
|
||||
from libprs500.ebooks.lrf.web.profiles.portfolio import Portfolio
|
||||
from libprs500.ebooks.lrf.web.profiles.dilbert import Dilbert
|
||||
from libprs500.ebooks.lrf.web.profiles.cnn import CNN
|
||||
from libprs500.ebooks.lrf.web.profiles.cnn import CNN
|
||||
from libprs500.ebooks.lrf.web.profiles.chr_mon import ChristianScienceMonitor
|
||||
from libprs500.ebooks.lrf.web.profiles.jpost import JerusalemPost
|
||||
from libprs500.ebooks.lrf.web.profiles.reuters import Reuters
|
||||
from libprs500.ebooks.lrf.web.profiles.atlantic import Atlantic
|
||||
|
||||
builtin_profiles = [Barrons, BBC, CNN, Dilbert, Economist, FazNet, Newsweek, NewYorkReviewOfBooks, NYTimes, \
|
||||
Portfolio, SpiegelOnline, WallStreetJournal, ZeitNachrichten, \
|
||||
# Every profile class bundled with the package, in the order they are
# presented (roughly alphabetical by publication name).
builtin_profiles = [
    Atlantic,
    Barrons,
    BBC,
    ChristianScienceMonitor,
    CNN,
    Dilbert,
    Economist,
    FazNet,
    JerusalemPost,
    Newsweek,
    NewYorkReviewOfBooks,
    NYTimes,
    Portfolio,
    Reuters,
    SpiegelOnline,
    WallStreetJournal,
    ZeitNachrichten,
]

# For each built-in profile, the last dotted component of the module that
# defines it (e.g. 'atlantic' for libprs500.ebooks.lrf.web.profiles.atlantic).
available_profiles = [
    profile.__module__.rpartition('.')[2] for profile in builtin_profiles
]
|
59
src/libprs500/ebooks/lrf/web/profiles/atlantic.py
Normal file
59
src/libprs500/ebooks/lrf/web/profiles/atlantic.py
Normal file
@ -0,0 +1,59 @@
|
||||
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
import re
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class Atlantic(DefaultProfile):
    """Profile for The Atlantic.

    The list of articles is scraped from the current-issue index page
    (``INDEX``) by ``parse_feeds`` instead of being read from RSS feeds.
    """

    title = 'The Atlantic'
    max_recursions = 2
    INDEX = 'http://www.theatlantic.com/doc/current'

    # Throw away everything between the opening <body> tag and the
    # "storytop" div so that only the article itself is converted.
    preprocess_regexps = [
        (re.compile(r'<body.*?<div id="storytop"', re.DOTALL|re.IGNORECASE),
         lambda m: '<body><div id="storytop"')
    ]

    def parse_feeds(self):
        """Scrape the issue index and return the articles it links to.

        Also sets ``self.timefmt`` from the page's ``<span class="issue">``
        element when one is present: the text after the last ``|`` is used,
        with ``/`` replaced by ``-``.

        Returns a dict with the single key ``'Current Issue'`` mapped to a
        list of dicts having the keys ``title``, ``date``, ``url`` and
        ``description``.
        """
        articles = []

        src = self.browser.open(self.INDEX).read()
        soup = BeautifulSoup(src)

        issue = soup.find('span', attrs={'class':'issue'})
        if issue:
            issue_label = self.tag_to_string(issue).rpartition('|')[-1].strip()
            self.timefmt = ' [%s]' % issue_label.replace('/', '-')

        for item in soup.findAll('div', attrs={'class':'item'}):
            a = item.find('a')
            if not (a and a.has_key('href')):
                continue

            # Point at the printer-friendly page. Only the first '/doc' is
            # rewritten: an unbounded replace would also mangle later path
            # segments that merely start with "doc" (e.g. '/doc/200803/doctors').
            # NOTE(review): assumes hrefs on the index look like '/doc/...'.
            url = 'http://www.theatlantic.com/' + a['href'].replace('/doc', 'doc/print', 1)

            byline = item.find(attrs={'class':'byline'})
            articles.append({
                'title': self.tag_to_string(a),
                # The byline text is stored in the 'date' slot.
                'date': self.tag_to_string(byline) if byline else '',
                'url': url,
                'description': '',
            })

        return {'Current Issue' : articles }
|
||||
|
||||
|
38
src/libprs500/ebooks/lrf/web/profiles/chr_mon.py
Normal file
38
src/libprs500/ebooks/lrf/web/profiles/chr_mon.py
Normal file
@ -0,0 +1,38 @@
|
||||
import re
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class ChristianScienceMonitor(DefaultProfile):
    """Profile for the Christian Science Monitor RSS feeds."""

    title = 'Christian Science Monitor'
    max_recursions = 2
    max_articles_per_feed = 20
    use_pubdate = False
    html_description = True
    html2lrf_options = ['--ignore-tables', '--base-font-size=8.0', '--wordspace=2.0',]

    # (pattern, replacement) pairs; every pattern is compiled
    # case-insensitively and with '.' matching newlines.
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
        (r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
        # The parentheses in setup(null) are escaped so that they match the
        # literal markup; unescaped they form a regex group and the pattern
        # can never match the real <body> tag.
        (r'<body class="apple-rss-no-unread-mode" onLoad="setup\(null\)">.*?<!-- start Entries -->', lambda match : '<BODY><!-- start Entries -->'),
        (r'<!-- end Entries -->.*?</BODY>', lambda match : '<!-- end Entries --></BODY>'),
        (r'<script>.*?</script>', lambda match : ''),
        (r'<body>.*?<div class="portlet-container">', lambda match : '<body><div class="portlet-container">'),
        (r'<div class="pubdate">.*?</div>', lambda match : ''),
        (r'<div class="factbox">.*?</body>', lambda match : '</body>'),
        ]
    ]

    def get_feeds(self):
        """Return the (title, feed URL) pairs to download."""
        return [ ('Top News', 'http://rss.csmonitor.com/feeds/top'),
                 ('Terrorism', 'http://rss.csmonitor.com/terrorismSecurity'),
                 ('World', 'http://rss.csmonitor.com/feeds/world'),
               ]

    def print_version(self, url):
        """Return the URL to fetch for the article at ``url``.

        Opens ``url`` with the profile's browser, takes the address the
        response reports via ``geturl()``, strips surrounding whitespace and
        drops the final character.
        """
        resolved_url = self.browser.open(url).geturl()
        # NOTE(review): presumably removing the last character turns the
        # article address into its printable variant -- confirm against the site.
        return resolved_url.strip()[:-1]
|
36
src/libprs500/ebooks/lrf/web/profiles/jpost.py
Normal file
36
src/libprs500/ebooks/lrf/web/profiles/jpost.py
Normal file
@ -0,0 +1,36 @@
|
||||
import re
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
class JerusalemPost(DefaultProfile):
    """Profile for the Jerusalem Post RSS feeds."""

    title = 'Jerusalem Post'
    max_recursions = 2
    max_articles_per_feed = 10

    # Each entry pairs a pattern with the callable producing its replacement;
    # all patterns are compiled ignoring case and with '.' spanning newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), substitute)
        for pattern, substitute in (
            (r'<HEAD>.*?</HEAD>',
             lambda m: '<HEAD></HEAD>'),
            (r'<BODY.*?>.*?<!-- start Entries -->',
             lambda m: '<BODY><!-- start Entries -->'),
            (r'<!-- end Entries -->.*?</BODY>',
             lambda m: '</BODY>'),
            (r'<script.*?>.*?</script>',
             lambda m: ''),
            (r'<div class="apple-rss-article apple-rss-read" onclick=.*?<div class="apple-rss-article-body">',
             lambda m: ''),
            (r'<img src=\'/images/logo_NWAnews.gif\' alt=\'NWAnews.com :: Northwest Arkansas\' News Source\'.*?>',
             lambda m: ''),
            (r'<img src=\'/images/logo_adg.gif\'.*?>',
             lambda m: ''),
            (r'<P CLASS="smallprint">.*?</body>',
             lambda m: '</body>'),
        )
    ]

    def get_feeds(self):
        """Return the (section title, feed URL) pairs to download."""
        feeds = [
            ('Front Page', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333346'),
            ('Israel News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156'),
            ('Middle East News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498'),
            ('International News', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144'),
            ('Editorials', 'http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333211'),
        ]
        return feeds

    def print_version(self, url):
        """Build the printer-page URL from the text after the last '&' in ``url``."""
        last_parameter = url.rpartition('&')[2]
        return ''.join((
            'http://www.jpost.com/servlet/Satellite?cid=',
            last_parameter,
            '&pagename=JPost%2FJPArticle%2FPrinter',
        ))
|
||||
|
39
src/libprs500/ebooks/lrf/web/profiles/reuters.py
Normal file
39
src/libprs500/ebooks/lrf/web/profiles/reuters.py
Normal file
@ -0,0 +1,39 @@
|
||||
import re
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
|
||||
|
||||
class Reuters(DefaultProfile):
    """Profile for the Reuters RSS feeds."""

    title = 'Reuters'
    max_recursions = 2
    max_articles_per_feed = 10
    html_description = True

    # (pattern, replacement) pairs; every pattern is compiled
    # case-insensitively and with '.' matching newlines.
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
        ##(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
        (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
        (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
        (r'<script.*?>.*?</script>', lambda match : ''),
        (r'<body>.*?<div class="contentBand">', lambda match : '<body>'),
        (r'<h3>Share:</h3>.*?</body>', lambda match : '<!-- END:: Shared Module id=36615 --></body>'),
        (r'<div id="atools" class="articleTools">.*?<div class="linebreak">', lambda match : '<div class="linebreak">'),
        ]
    ]

    def get_feeds(self):
        """Return the (section title, feed URL) pairs to download."""
        return [ ('Top Stories', 'http://feeds.reuters.com/reuters/topNews?format=xml'),
                 ('US News', 'http://feeds.reuters.com/reuters/domesticNews?format=xml'),
                 ('World News', 'http://feeds.reuters.com/reuters/worldNews?format=xml'),
                 ('Politics News', 'http://feeds.reuters.com/reuters/politicsNews?format=xml'),
                 ('Science News', 'http://feeds.reuters.com/reuters/scienceNews?format=xml'),
                 # Title corrected from the misspelled 'Emviroment News'.
                 ('Environment News', 'http://feeds.reuters.com/reuters/Environment?format=xml'),
                 ('Technology News', 'http://feeds.reuters.com/reuters/technologyNews?format=xml'),
                 ('Oddly Enough News', 'http://feeds.reuters.com/reuters/oddlyEnoughNews?format=xml')
               ]

    def print_version(self, url):
        """Return ``url`` wrapped between the article prefix and '?sp=true'."""
        # NOTE(review): this only yields a usable address if ``url`` is a bare
        # article id rather than a full link -- confirm what the caller passes.
        return ('http://www.reuters.com/article/id' + url + '?sp=true')
|
Loading…
x
Reference in New Issue
Block a user