Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)

Commit 8799a6f3f2 (parent 275b59a2e7): Add support for user profiles in web2lrf
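The new --user-profile option points web2lrf at a Python file containing a DefaultProfile subclass; the first such class found in the module is used as the profile. A minimal sketch of what such a file could look like, based on the DefaultProfile API added in this commit (the file name, class name, feed title and URLs below are placeholders, not part of the commit):

## example_news.py -- used as: web2lrf --user-profile /path/to/example_news.py
from libprs500.ebooks.lrf.web.profiles import DefaultProfile

class ExampleNews(DefaultProfile):
    # Hypothetical profile: title, feed and print URLs are placeholders
    title          = 'Example News'
    timefmt        = ' [%d %b %Y]'
    max_recursions = 2

    def get_feeds(self):
        # Each entry is a (title, url) tuple pointing to an RSS feed
        return [('Front Page', 'http://example.com/rss/frontpage.xml')]

    def print_version(self, url):
        # Map an article URL to its printer-friendly version
        return url + '?print=1'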
@@ -13,134 +13,3 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-import os, time, calendar, operator, re
-
-from libprs500 import iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
-
-DAY_MAP        = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
-MONTH_MAP      = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
-FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
-                      July=7, August=8, September=9, October=10,
-                      November=11, December=12)
-
-def strptime(src):
-    src = src.strip().split()
-    src[0] = str(DAY_MAP[src[0][:-1]])+','
-    try:
-        src[2] = str(MONTH_MAP[src[2]])
-    except KeyError:
-        src[2] = str(FULL_MONTH_MAP[src[2]])
-    return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
-
-def process_html_description(tag):
-    src = '\n'.join(tag.contents)
-    replaced_entities = ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']
-    for e in replaced_entities:
-        ent = '&'+e+';'
-        src = src.replace(ent, unichr(name2codepoint[e]))
-    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-def parse_feeds(feeds, browser, print_version,
-                max_articles_per_feed=10,
-                html_description=False,
-                oldest_article=7):
-    '''
-    @param print_version: Callable that takes a url string and returns the url to the
-                          printable version of the article pointed to by the original url.
-    @param max_articles_per_feed: Maximum number of articles to download from each feed
-    @param html_description: If True the article descriptions are processed as HTML
-    @param oldest_article: A number in days. No articles older than now - oldest_article
-                           will be downloaded.
-    '''
-    articles = {}
-    for title, url in feeds:
-        try:
-            src = browser.open(url).read()
-        except Exception, err:
-            print 'Could not fetch feed: %s\nError: %s'%(url, err)
-            continue
-
-        articles[title] = []
-        soup = BeautifulStoneSoup(src)
-        for item in soup.findAll('item'):
-            try:
-                pubdate = item.find('pubdate').string
-                if not pubdate:
-                    continue
-                pubdate = pubdate.replace('+0000', 'GMT')
-                d = {
-                    'title'    : item.find('title').string,
-                    'url'      : print_version(item.find('guid').string),
-                    'timestamp': calendar.timegm(strptime(pubdate)),
-                    'date'     : pubdate
-                    }
-                delta = time.time() - d['timestamp']
-                if delta > oldest_article*3600*24:
-                    continue
-
-            except Exception, err:
-                continue
-            try:
-                desc = item.find('description')
-                d['description'] = process_html_description(desc) if html_description else desc.string
-            except:
-                d['description'] = ''
-            articles[title].append(d)
-        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
-        articles[title][max_articles_per_feed:] = []
-        for item in articles[title]:
-            item.pop('timestamp')
-        if not articles[title]:
-            articles.pop(title)
-    return articles
-
-
-def build_index(title, articles, dir):
-    '''Build an RSS based index.html'''
-
-    def build_sub_index(title, items):
-        ilist = ''
-        li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
-             u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
-        for item in items:
-            ilist += li%item
-        return u'''\
-<html>
-<body>
-<h2>%(title)s</h2>
-<ul>
-%(items)s
-</ul>
-</body>
-</html>
-'''%dict(title=title, items=ilist.rstrip())
-
-    cnum  = 0
-    clist = ''
-    categories = articles.keys()
-    categories.sort()
-    for category in categories:
-        cnum += 1
-        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
-        prefix = 'file:' if iswindows else ''
-        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-        src = build_sub_index(category, articles[category])
-        open(cfile, 'wb').write(src.encode('utf-8'))
-
-    src = '''\
-<html>
-<body>
-<h1>%(title)s</h1>
-<div style='text-align: right; font-weight: bold'>%(date)s</div>
-<ul>
-%(categories)s
-</ul>
-</body>
-</html>
-'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
-         categories=clist, title=title)
-    index = os.path.join(dir, 'index.html')
-    open(index, 'wb').write(src.encode('utf-8'))
-    return index
@@ -1,53 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

import tempfile, shutil, os
from libprs500.ebooks.lrf.web import build_index, parse_feeds

RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'

from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup


def get_feeds(browser):
    src = browser.open(RSS).read()
    soup = BeautifulSoup(src[src.index('<html'):])
    feeds = []
    ul = soup.find('ul', attrs={'class':'rss'})
    for link in ul.findAll('a'):
        feeds.append((link.string, link['href']))
    return feeds

def initialize(profile):
    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    profile['browser'] = browser()
    feeds = get_feeds(profile['browser'])
    articles = parse_feeds(feeds, profile['browser'], lambda x: x.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/'))
    index = build_index('The BBC', articles, profile['temp dir'])
    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
    profile['timefmt'] = ' [%a, %d %b, %Y]'
    profile['max_recursions'] = 2
    profile['title'] = 'The BBC'
    profile['no_stylesheets'] = True

def finalize(profile):
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])
@@ -14,43 +14,48 @@
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Convert known websites into LRF files.'''
 
-import sys, time, tempfile, shutil, os, logging
+import sys, time, tempfile, shutil, os, logging, imp, inspect
 from urlparse import urlsplit
 
 from libprs500 import __appname__, setup_cli_handlers, CommandLineError
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks.lrf.html.convert_from import process_file
-from libprs500.ebooks.lrf.web.profiles import profiles
 from libprs500.web.fetch.simple import create_fetcher
 
-available_profiles = profiles.keys()
-available_profiles.remove('default')
-available_profiles = ' '.join(available_profiles)
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes
+from libprs500.ebooks.lrf.web.profiles.bbc import BBC
+from libprs500.ebooks.lrf.web.profiles.newsweek import Newsweek
 
+builtin_profiles   = [NYTimes, BBC, Newsweek]
+available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
 
 def option_parser():
     parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
                 '''%prog downloads a site from the web and converts it '''
                 '''into a LRF file for use with the SONY Reader. '''
-                '''website_profile is one of '''+available_profiles+\
+                '''website_profile is one of '''+str(available_profiles)+\
                 ''' If you specify a website_profile of default or do not specify '''
                 '''it, you must specify the --url option.'''
                 )
 
     parser.add_option('-u', '--url', dest='url', default=None,
                       help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
+    parser.add_option('--user-profile', default=None,
+                      help='Path to a python file containing a user created profile.')
     parser.add_option('--username', dest='username', default=None,
                       help='Specify the username to be used while downloading. Only used if the profile supports it.')
    parser.add_option('--password', dest='password', default=None,
                      help='Specify the password to be used while downloading. Only used if the profile supports it.')
-    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %(timeout)s s'%profiles['default'],
+    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
                      default=None, type='int', dest='timeout')
-    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %(max_recursions)s'%profiles['default'],
+    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
                      default=None, type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
-                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %(max_files)s'%profiles['default'])
+                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
     parser.add_option('--delay', default=None, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %(delay)s s'%profiles['default'])
+                      help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
     parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                       help='Do not download CSS stylesheets.', dest='no_stylesheets')
 
@@ -85,45 +90,58 @@ def process_profile(args, options, logger=None):
     level = logging.DEBUG if options.verbose else logging.INFO
     logger = logging.getLogger('web2lrf')
     setup_cli_handlers(logger, level)
+    index = -1
+    if options.user_profile is not None:
+        path = os.path.abspath(options.user_profile)
+        name = os.path.splitext(os.path.basename(path))[0]
+        res = imp.find_module(name, [os.path.dirname(path)])
+        module = imp.load_module(name, *res)
+        classes = inspect.getmembers(module,
+            lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
+                       and x is not DefaultProfile)
+        if not classes:
+            raise CommandLineError('Invalid user profile '+path)
+        builtin_profiles.append(classes[0][1])
+        available_profiles.append(name)
+        if len(args) < 2:
+            args.append('')
+        args[1] = name
     if len(args) == 2:
-        if not profiles.has_key(args[1]):
-            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
-    profile = profiles[args[1]] if len(args) == 2 else profiles['default']
-    profile['username'] = options.username
-    profile['password'] = options.password
-    if profile.has_key('initialize'):
-        profile['initialize'](profile)
-    if profile.has_key('browser'):
-        options.browser = profile['browser']
+        try:
+            index = available_profiles.index(args[1])
+        except ValueError:
+            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
+    profile = DefaultProfile if index == -1 else builtin_profiles[index]
+    profile = profile(options.username, options.password)
+    if profile.browser is not None:
+        options.browser = profile.browser
 
     for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
         val = getattr(options, opt)
         if val is None:
-            setattr(options, opt, profile[opt])
+            setattr(options, opt, getattr(profile, opt))
 
     if not options.url:
-        options.url = profile['url']
+        options.url = profile.url
 
     if not options.url:
         raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
 
     if not options.title:
-        title = profile['title']
+        title = profile.title
         if not title:
             title = urlsplit(options.url).netloc
-        options.title = title + time.strftime(profile['timefmt'], time.localtime())
+        options.title = title + time.strftime(profile.timefmt, time.localtime())
 
-    options.match_regexps += profile['match_regexps']
-    options.preprocess_regexps = profile['preprocess_regexps']
-    options.filter_regexps += profile['filter_regexps']
+    options.match_regexps += profile.match_regexps
+    options.preprocess_regexps = profile.preprocess_regexps
+    options.filter_regexps += profile.filter_regexps
     if len(args) == 2 and args[1] != 'default':
         options.anchor_ids = False
 
         htmlfile, tdir = fetch_website(options, logger)
         create_lrf(htmlfile, options, logger)
     finally:
-        if profile.has_key('finalize'):
-            profile['finalize'](profile)
         if tdir and os.path.isdir(tdir):
             shutil.rmtree(tdir)
 
@@ -1,81 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## [GPL v2 license header, as above]
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

import tempfile, shutil, os
from libprs500.ebooks.lrf.web import build_index, parse_feeds

RSS = 'http://economist.com/rss/'
TITLES = [
          'The world this week',
          'Letters',
          'Briefings',
          'Special reports',
          'Britain',
          'Europe',
          'United States',
          'The Americas',
          'Middle East and Africa',
          'Asia',
          'International',
          'Business',
          'Finance and economics',
          'Science and technology',
          'Books and arts',
          'Indicators'
         ]

from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

def print_version(url):
    return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')

def get_feeds(browser):
    src = browser.open(RSS).read()
    soup = BeautifulSoup(src)
    feeds = []
    for ul in soup.findAll('ul'):
        lis = ul.findAll('li')
        try:
            title, link = lis[0], lis[1]
        except IndexError:
            continue
        title = title.string
        if title:
            title = title.strip()
        if title not in TITLES:
            continue
        a = link.find('a')
        feeds.append((title, a['href'].strip()))

    return feeds

def initialize(profile):
    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    profile['browser'] = browser()
    feeds = get_feeds(profile['browser'])
    articles = parse_feeds(feeds, profile['browser'], print_version, max_articles_per_feed=20)
    index = build_index('The Economist', articles, profile['temp dir'])
    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
    profile['timefmt'] = ' [%d %b %Y]'
    profile['max_recursions'] = 3
    profile['title'] = 'The Economist'
    profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts

def finalize(profile):
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])
@@ -1,73 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## [GPL v2 license header, as above]
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''New York Times from RSS feeds.'''
import os, tempfile, shutil

from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.lrf.web import build_index, parse_feeds

RSS   = 'http://www.nytimes.com/services/xml/rss/index.html'
LOGIN = 'http://www.nytimes.com/auth/login'

def get_feeds(browser):
    src = browser.open(RSS).read()
    soup = BeautifulSoup(src[src.index('<html'):])
    feeds = []
    for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
        if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
                                 'Dining & Wine', 'Home & Garden', 'Multimedia',
                                 'Most E-mailed Articles',
                                 'Automobiles', 'Fashion & Style', 'Television News',
                                 'Education']:
            feeds.append((link['title'], link['href']))
        #else: print link['title']

    return feeds

def initialize(profile):
    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    profile['browser'] = login(profile)
    feeds = get_feeds(profile['browser'])
    articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print',
                           oldest_article=2)
    index = build_index('The New York Times', articles, profile['temp dir'])
    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
    profile['timefmt'] = ' [%a, %d %b, %Y]'
    profile['max_recursions'] = 2
    profile['title'] = 'The New York Times'


def finalize(profile):
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])

def login(profile):
    br = browser()
    if profile['username'] and profile['password']:
        br.open(LOGIN)
        br.select_form(name='login')
        br['USERID'] = profile['username']
        br['PASSWORD'] = profile['password']
        br.submit()
    return br

if __name__ == '__main__':
    feeds = get_feeds()
    articles = parse_feeds(feeds)
    print articles
@@ -1,136 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## [GPL v2 license header, as above]
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Profiles for known websites.'''
import re

from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize
from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize
from libprs500.ebooks.lrf.web.economist import initialize as economist_initialize
from libprs500.ebooks.lrf.web.economist import finalize as economist_finalize


profiles = {
    'default' : {
        'url'                : '',    # The URL of the website
        'title'              : '',    # The title to use for the LRF file
        'max_recursions'     : 1,     # Number of levels of links to follow
        'max_files'          : 1000,  # Maximum number of files to download
        'delay'              : 0,     # Delay between consecutive downloads
        'timeout'            : 10,    # Timeout for fetching files from server in seconds
        'timefmt'            : ' [%a %d %b %Y]',
        'no_stylesheets'     : False, # Download stylesheets
        'match_regexps'      : [],    # List of regular expressions that determines which links to follow
        'filter_regexps'     : [],    # List of regular expressions that determines which links to ignore
                                      # Only one of match_regexps or filter_regexps should be defined
        'html2lrf_options'   : [],    # List of options to pass to html2lrf
        'preprocess_regexps' : [],    # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
                                      # See the profiles below for examples of these settings.
    },

    'nytimes' : {
        'initialize' : nytimes_initialize,
        'finalize'   : nytimes_finalize,

        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
               # Remove header bar
               (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
               (r'<div class="articleTools">.*></ul>', lambda match : ''),
               # Remove footer bar
               (r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
               (r'<div id="footer">.*', lambda match : '</body></html>'),
              ]
            ],
    },

    'bbc' : {
        'initialize' : bbc_initialize,
        'finalize'   : bbc_finalize,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
               # Remove footer from individual stories
               (r'<div class=.footer.>.*?Published',
                lambda match : '<p></p><div class="footer">Published'),
               # Add some style info in place of disabled stylesheet
               (r'<link.*?type=.text/css.*?>', lambda match :
                '''<style type="text/css">
                .headline {font-size: x-large;}
                .fact { padding-top: 10pt }
                </style>'''),
              ]
            ],
    },

    'newsweek' : {
        'initialize' : newsweek_initialize,
        'finalize'   : newsweek_finalize,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
               # Make fonts larger
               (r'<style.*?\.copyright.*?</style>',
                lambda match : \
                '''<style type="text/css">'''
                '''updateTime{font:small Arial;color:#000000;}'''
                '''.credit{font:small Arial;color:#999999;}'''
                '''.head{font:bold 18pt x-large;color:#CC0000;}'''
                '''.abstract{font:14pt large Verdana;color:#000000;}'''
                '''.title{font:bold;color:#000000;}'''
                '''.source{font:bold small Verdana;color:#CC0000;}'''
                '''.footerLink{font:bold Verdana;color:#000000;}'''
                '''.caption{font: Verdana;color:#000000;}'''
                '''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
                '''.copyright{font-style:italic;}'''
                '''</style>'''
                ),
              ]
            ],
    },

    'economist' : {
        'initialize' : economist_initialize,
        'finalize'   : economist_finalize,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
               # Remove advert
               (r'<noscript.*?</noscript>', lambda match: ''),
              ]
            ],
    },
}

for key in profiles.keys():
    if key == 'default':
        continue
    newd = profiles['default'].copy()
    newd.update(profiles[key])
    profiles[key] = newd

def profile_to_command_line_options(profile):
    args = []
    args.append('--max-recursions='+str(profile['max_recursions']))
    args.append('--delay='+str(profile['delay']))
    for i in profile['match_regexps']:
        args.append('--match-regexp="'+i+'"')
    for i in profile['filter_regexps']:
        args.append('--filter-regexp="'+i+'"')
    return args
src/libprs500/ebooks/lrf/web/profiles/__init__.py (new file, 227 lines)
@@ -0,0 +1,227 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## [GPL v2 license header, as above]
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
'''

import tempfile, time, calendar, re, operator
from htmlentitydefs import name2codepoint

from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup


class DefaultProfile(object):

    url                   = ''                # The URL of the website
    title                 = 'Default Profile' # The title to use for the LRF file
    max_articles_per_feed = 10                # Maximum number of articles to download from each feed
    html_description      = False             # If True process the <description> element of the feed as HTML
    oldest_article        = 7                 # How many days old should the oldest article downloaded from the feeds be?
    max_recursions        = 1                 # Number of levels of links to follow
    max_files             = 3000              # Maximum number of files to download
    delay                 = 0                 # Delay between consecutive downloads
    timeout               = 10                # Timeout for fetching files from server in seconds
    timefmt               = ' [%a %d %b %Y]'  # The format of the date shown on the first page
    no_stylesheets        = False             # Download stylesheets only if False
    match_regexps         = []                # List of regular expressions that determines which links to follow
    filter_regexps        = []                # List of regular expressions that determines which links to ignore
                                              # Only one of match_regexps or filter_regexps should be defined

    html2lrf_options      = []                # List of options to pass to html2lrf
    # List of regexp substitution rules to run on the downloaded HTML. Each element of the
    # list should be a two element tuple. The first element of the tuple should
    # be a compiled regular expression and the second a callable that takes
    # a single match object and returns a string to replace the match.
    preprocess_regexps    = []

    # See the built-in profiles for examples of these settings.

    def get_feeds(self):
        '''
        Return a list of RSS feeds to fetch for this profile. Each element of the list
        must be a 2-element tuple of the form (title, url).
        '''
        raise NotImplementedError

    @classmethod
    def print_version(cls, url):
        '''
        Take a URL pointing to an article and return the URL pointing to the
        print version of the article.
        '''
        return url

    @classmethod
    def get_browser(cls):
        '''
        Return a browser instance used to fetch documents from the web.

        If your profile requires that you login first, override this method
        in your subclass. See for example the nytimes profile.
        '''
        return browser()

    ########################################################################
    ###################### End of customizable portion #####################
    ########################################################################

    def __init__(self, username=None, password=None):
        self.username = username
        self.password = password
        self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
        self.browser  = self.get_browser()
        self.url      = 'file:'+ ('' if iswindows else '//') + self.build_index()

    def __del__(self):
        import os, shutil
        if os.path.isdir(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def build_index(self):
        '''Build an RSS based index.html'''
        import os
        articles = self.parse_feeds()

        def build_sub_index(title, items):
            ilist = ''
            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
                 u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
            for item in items:
                ilist += li%item
            return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())

        cnum  = 0
        clist = ''
        categories = articles.keys()
        categories.sort()
        for category in categories:
            cnum  += 1
            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
            prefix = 'file:' if iswindows else ''
            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
            src = build_sub_index(category, articles[category])
            open(cfile, 'wb').write(src.encode('utf-8'))

        src = '''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
         categories=clist, title=self.title)
        index = os.path.join(self.temp_dir, 'index.html')
        open(index, 'wb').write(src.encode('utf-8'))
        return index

    def parse_feeds(self):
        feeds = self.get_feeds()
        articles = {}
        for title, url in feeds:
            try:
                src = self.browser.open(url).read()
            except Exception, err:
                print 'Could not fetch feed: %s\nError: %s'%(url, err)
                continue

            articles[title] = []
            soup = BeautifulStoneSoup(src)
            for item in soup.findAll('item'):
                try:
                    pubdate = item.find('pubdate').string
                    if not pubdate:
                        continue
                    pubdate = pubdate.replace('+0000', 'GMT')
                    d = {
                        'title'    : item.find('title').string,
                        'url'      : self.print_version(item.find('guid').string),
                        'timestamp': calendar.timegm(self.strptime(pubdate)),
                        'date'     : pubdate
                        }
                    delta = time.time() - d['timestamp']
                    if delta > self.oldest_article*3600*24:
                        continue

                except Exception, err:
                    continue
                try:
                    desc = item.find('description')
                    d['description'] = self.process_html_description(desc) if self.html_description else desc.string
                except:
                    d['description'] = ''
                articles[title].append(d)
            articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
            articles[title][self.max_articles_per_feed:] = []
            for item in articles[title]:
                item.pop('timestamp')
            if not articles[title]:
                articles.pop(title)
        return articles

    @classmethod
    def process_html_description(cls, tag):
        src = '\n'.join(tag.contents)
        replaced_entities = ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']
        for e in replaced_entities:
            ent = '&'+e+';'
            src = src.replace(ent, unichr(name2codepoint[e]))
        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)


    DAY_MAP        = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
    MONTH_MAP      = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
    FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
                          July=7, August=8, September=9, October=10,
                          November=11, December=12)

    @classmethod
    def strptime(cls, src):
        src = src.strip().split()
        src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
        try:
            src[2] = str(cls.MONTH_MAP[src[2]])
        except KeyError:
            src[2] = str(cls.FULL_MONTH_MAP[src[2]])
        return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')

    def command_line_options(self):
        args = []
        args.append('--max-recursions='+str(self.max_recursions))
        args.append('--delay='+str(self.delay))
        args.append('--max-files='+str(self.max_files))
        for i in self.match_regexps:
            args.append('--match-regexp="'+i+'"')
        for i in self.filter_regexps:
            args.append('--filter-regexp="'+i+'"')
        return args
src/libprs500/ebooks/lrf/web/profiles/bbc.py (new file, 57 lines)
@@ -0,0 +1,57 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## [GPL v2 license header, as above]
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch the BBC.
'''
import re

from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

class BBC(DefaultProfile):

    title          = 'The BBC'
    max_recursions = 2
    timefmt        = ' [%a, %d %b, %Y]'
    no_stylesheets = True

    preprocess_regexps = \
        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
          [
           # Remove footer from individual stories
           (r'<div class=.footer.>.*?Published',
            lambda match : '<p></p><div class="footer">Published'),
           # Add some style info in place of disabled stylesheet
           (r'<link.*?type=.text/css.*?>', lambda match :
            '''<style type="text/css">
            .headline {font-size: x-large;}
            .fact { padding-top: 10pt }
            </style>'''),
          ]
        ]

    def print_version(self, url):
        return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')

    def get_feeds(self):
        src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
        soup = BeautifulSoup(src[src.index('<html'):])
        feeds = []
        ul = soup.find('ul', attrs={'class':'rss'})
        for link in ul.findAll('a'):
            feeds.append((link.string, link['href']))
        return feeds
src/libprs500/ebooks/lrf/web/profiles/economist.py (new file, 81 lines)
@@ -0,0 +1,81 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## [GPL v2 license header, as above]
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch The Economist.
'''
import re

from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

class Economist(DefaultProfile):

    title          = 'The Economist'
    timefmt        = ' [%d %b %Y]'
    max_recursions = 3

    TITLES = [
              'The world this week',
              'Letters',
              'Briefings',
              'Special reports',
              'Britain',
              'Europe',
              'United States',
              'The Americas',
              'Middle East and Africa',
              'Asia',
              'International',
              'Business',
              'Finance and economics',
              'Science and technology',
              'Books and arts',
              'Indicators'
             ]

    preprocess_regexps = \
        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
          [
           # Remove advert
           (r'<noscript.*?</noscript>', lambda match: ''),
          ]
        ]

    def __init__(self, username=None, password=None):
        DefaultProfile.__init__(self, username, password)
        self.browser = None # Needed as otherwise there are timeouts while fetching actual articles

    def print_version(self, url):
        return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')

    def get_feeds(self):
        src = self.browser.open('http://economist.com/rss/').read()
        soup = BeautifulSoup(src)
        feeds = []
        for ul in soup.findAll('ul'):
            lis = ul.findAll('li')
            try:
                title, link = lis[0], lis[1]
            except IndexError:
                continue
            title = title.string
            if title:
                title = title.strip()
            if title not in self.__class__.TITLES:
                continue
            a = link.find('a')
            feeds.append((title, a['href'].strip()))

        return feeds
@@ -12,14 +12,27 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-'''Logic to create a Newsweek HTML aggregator from RSS feeds'''
+'''
+Profile to download Newsweek
+'''
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
 
-import tempfile, os, shutil
+class Newsweek(DefaultProfile):
 
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-from libprs500 import __appname__, iswindows, browser
+    title            = 'Newsweek'
+    max_recursions   = 2
+    timefmt          = ' [%d %b %Y]'
+    html_description = True
+    oldest_article   = 15
 
-RSS_FEEDS = [
+    def print_version(self, url):
+        if not url.endswith('/'):
+            url += '/'
+        return url + 'output/print'
+
+    def get_feeds(self):
+        return [
         ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
         ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
         ('Politics', 'http://feeds.newsweek.com/headlines/politics'),
@@ -34,26 +47,3 @@ RSS_FEEDS = [
         ]
 
-
-def print_version(url):
-    if '?' in url:
-        url = url[:url.index('?')]
-    if not url.endswith('/'):
-        url += '/'
-    return url + 'output/print'
-
-def initialize(profile):
-    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
-    profile['browser'] = browser()
-    articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
-                           max_articles_per_feed=20, oldest_article=15,
-                           html_description=True)
-    index = build_index('Newsweek', articles, profile['temp dir'])
-    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
-    profile['timefmt'] = ' [%d %b %Y]'
-    profile['max_recursions'] = 2
-    profile['title'] = 'Newsweek'
-    profile['url'] = 'file:'+ ('' if iswindows else '//') +index
-
-def finalize(profile):
-    if os.path.isdir(profile['temp dir']):
-        shutil.rmtree(profile['temp dir'])
src/libprs500/ebooks/lrf/web/profiles/nytimes.py (new file, 66 lines)
@@ -0,0 +1,66 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## [GPL v2 license header, as above]
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Profile to download the New York Times
'''
import re

from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

class NYTimes(DefaultProfile):

    title          = 'The New York Times'
    timefmt        = ' [%a, %d %b, %Y]'
    max_recursions = 2

    preprocess_regexps = \
        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
          [
           # Remove header bar
           (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
           (r'<div class="articleTools">.*></ul>', lambda match : ''),
           # Remove footer bar
           (r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
           (r'<div id="footer">.*', lambda match : '</body></html>'),
          ]
        ]

    def get_browser(self):
        br = DefaultProfile.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID'] = self.username
            br['PASSWORD'] = self.password
            br.submit()
        return br

    def get_feeds(self):
        src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
        soup = BeautifulSoup(src[src.index('<html'):])
        feeds = []
        for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
            if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
                                     'Dining & Wine', 'Home & Garden', 'Multimedia',
                                     'Most E-mailed Articles',
                                     'Automobiles', 'Fashion & Style', 'Television News',
                                     'Education']:
                feeds.append((link['title'], link['href']))

        return feeds

    def print_version(self, url):
        return url + '?&pagewanted=print'