Add support for user profiles in web2lrf

Kovid Goyal 2007-10-27 00:34:58 +00:00
parent 275b59a2e7
commit 8799a6f3f2
11 changed files with 500 additions and 535 deletions

View File

@@ -13,134 +13,3 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, time, calendar, operator, re
from libprs500 import iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
July=7, August=8, September=9, October=10,
November=11, December=12)
def strptime(src):
src = src.strip().split()
src[0] = str(DAY_MAP[src[0][:-1]])+','
try:
src[2] = str(MONTH_MAP[src[2]])
except KeyError:
src[2] = str(FULL_MONTH_MAP[src[2]])
return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
def process_html_description(tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
def parse_feeds(feeds, browser, print_version,
max_articles_per_feed=10,
html_description=False,
oldest_article=7):
'''
@param print_version: Callable that takes a url string and returns the url to
printable version of the article pointed to by the original url.
@param max_articles_per_feed: Maximum number of articles to download from each feed
@param html_description: If True the articles' descriptions are processed as HTML
@param oldest_article: A number in days. No articles older than now - oldest_article
will be downloaded.
'''
articles = {}
for title, url in feeds:
try:
src = browser.open(url).read()
except Exception, err:
print 'Could not fetch feed: %s\nError: %s'%(url, err)
continue
articles[title] = []
soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'):
try:
pubdate = item.find('pubdate').string
if not pubdate:
continue
pubdate = pubdate.replace('+0000', 'GMT')
d = {
'title' : item.find('title').string,
'url' : print_version(item.find('guid').string),
'timestamp': calendar.timegm(strptime(pubdate)),
'date' : pubdate
}
delta = time.time() - d['timestamp']
if delta > oldest_article*3600*24:
continue
except Exception, err:
continue
try:
desc = item.find('description')
d['description'] = process_html_description(desc) if html_description else desc.string
except:
d['description'] = ''
articles[title].append(d)
articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
articles[title][max_articles_per_feed:] = []
for item in articles[title]:
item.pop('timestamp')
if not articles[title]:
articles.pop(title)
return articles
def build_index(title, articles, dir):
'''Build an RSS based index.html'''
def build_sub_index(title, items):
ilist = ''
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
for item in items:
ilist += li%item
return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())
cnum = 0
clist = ''
categories = articles.keys()
categories.sort()
for category in categories:
cnum += 1
cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
prefix = 'file:' if iswindows else ''
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
src = build_sub_index(category, articles[category])
open(cfile, 'wb').write(src.encode('utf-8'))
src = '''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
categories=clist, title=title)
index = os.path.join(dir, 'index.html')
open(index, 'wb').write(src.encode('utf-8'))
return index
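
A rough usage sketch of the helpers above (which this commit folds into the new DefaultProfile class); the feed title, URL and print_version rewrite are illustrative only:

import tempfile
from libprs500 import browser
from libprs500.ebooks.lrf.web import parse_feeds, build_index

br = browser()
feeds = [('Front Page', 'http://example.com/rss.xml')]   # (title, url) pairs
articles = parse_feeds(feeds, br, lambda url: url + '?print=yes',
                       max_articles_per_feed=5, oldest_article=2)
# articles maps each feed title to a list of dicts with 'title', 'url',
# 'date' and 'description' keys, newest first
index = build_index('Example Site', articles, tempfile.mkdtemp())
print index    # path to the generated index.html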

View File

@@ -1,53 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import tempfile, shutil, os
from libprs500.ebooks.lrf.web import build_index, parse_feeds
RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
def get_feeds(browser):
src = browser.open(RSS).read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
ul = soup.find('ul', attrs={'class':'rss'})
for link in ul.findAll('a'):
feeds.append((link.string, link['href']))
return feeds
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
feeds = get_feeds(profile['browser'])
articles = parse_feeds(feeds, profile['browser'], lambda x: x.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/'))
index = build_index('The BBC', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%a, %d %b, %Y]'
profile['max_recursions'] = 2
profile['title'] = 'The BBC'
profile['no_stylesheets'] = True
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -14,43 +14,48 @@
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert known websites into LRF files.'''
import sys, time, tempfile, shutil, os, logging
import sys, time, tempfile, shutil, os, logging, imp, inspect
from urlparse import urlsplit
from libprs500 import __appname__, setup_cli_handlers, CommandLineError
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.web.profiles import profiles
from libprs500.web.fetch.simple import create_fetcher
available_profiles = profiles.keys()
available_profiles.remove('default')
available_profiles = ' '.join(available_profiles)
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes
from libprs500.ebooks.lrf.web.profiles.bbc import BBC
from libprs500.ebooks.lrf.web.profiles.newsweek import Newsweek
builtin_profiles = [NYTimes, BBC, Newsweek]
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
def option_parser():
parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
'''%prog downloads a site from the web and converts it '''
'''into an LRF file for use with the SONY Reader. '''
'''website_profile is one of '''+available_profiles+\
'''website_profile is one of '''+str(available_profiles)+\
''' If you specify a website_profile of default or do not specify '''
'''it, you must specify the --url option.'''
)
parser.add_option('-u', '--url', dest='url', default=None,
help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
parser.add_option('--user-profile', default=None,
help='Path to a python file containing a user created profile.')
parser.add_option('--username', dest='username', default=None,
help='Specify the username to be used while downloading. Only used if the profile supports it.')
parser.add_option('--password', dest='password', default=None,
help='Specify the password to be used while downloading. Only used if the profile supports it.')
parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %(timeout)s s'%profiles['default'],
parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
default=None, type='int', dest='timeout')
parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %(max_recursions)s'%profiles['default'],
parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
default=None, type='int', dest='max_recursions')
parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %(max_files)s'%profiles['default'])
help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
parser.add_option('--delay', default=None, dest='delay', type='int',
help='Minimum interval in seconds between consecutive fetches. Default is %(delay)s s'%profiles['default'])
help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
help='Do not download CSS stylesheets.', dest='no_stylesheets')
@@ -85,45 +90,58 @@ def process_profile(args, options, logger=None):
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2lrf')
setup_cli_handlers(logger, level)
index = -1
if options.user_profile is not None:
path = os.path.abspath(options.user_profile)
name = os.path.splitext(os.path.basename(path))[0]
res = imp.find_module(name, [os.path.dirname(path)])
module = imp.load_module(name, *res)
classes = inspect.getmembers(module,
lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
and x is not DefaultProfile)
if not classes:
raise CommandLineError('Invalid user profile '+path)
builtin_profiles.append(classes[0][1])
available_profiles.append(name)
if len(args) < 2:
args.append('')
args[1] = name
if len(args) == 2:
if not profiles.has_key(args[1]):
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
profile = profiles[args[1]] if len(args) == 2 else profiles['default']
profile['username'] = options.username
profile['password'] = options.password
if profile.has_key('initialize'):
profile['initialize'](profile)
if profile.has_key('browser'):
options.browser = profile['browser']
try:
index = available_profiles.index(args[1])
except ValueError:
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
profile = DefaultProfile if index == -1 else builtin_profiles[index]
profile = profile(options.username, options.password)
if profile.browser is not None:
options.browser = profile.browser
for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
val = getattr(options, opt)
if val is None:
setattr(options, opt, profile[opt])
setattr(options, opt, getattr(profile, opt))
if not options.url:
options.url = profile['url']
options.url = profile.url
if not options.url:
raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
if not options.title:
title = profile['title']
title = profile.title
if not title:
title = urlsplit(options.url).netloc
options.title = title + time.strftime(profile['timefmt'], time.localtime())
options.title = title + time.strftime(profile.timefmt, time.localtime())
options.match_regexps += profile['match_regexps']
options.preprocess_regexps = profile['preprocess_regexps']
options.filter_regexps += profile['filter_regexps']
options.match_regexps += profile.match_regexps
options.preprocess_regexps = profile.preprocess_regexps
options.filter_regexps += profile.filter_regexps
if len(args) == 2 and args[1] != 'default':
options.anchor_ids = False
htmlfile, tdir = fetch_website(options, logger)
create_lrf(htmlfile, options, logger)
finally:
if profile.has_key('finalize'):
profile['finalize'](profile)
if tdir and os.path.isdir(tdir):
shutil.rmtree(tdir)
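
A minimal sketch of a user profile file that the new --user-profile option could load; process_profile imports the file and picks the first DefaultProfile subclass it finds. The module name, class name, feed URL and print rewrite below are all hypothetical:

# my_profile.py -- run with something like: web2lrf --user-profile my_profile.py
from libprs500.ebooks.lrf.web.profiles import DefaultProfile

class MySite(DefaultProfile):
    title = 'My Site'
    timefmt = ' [%d %b %Y]'
    max_recursions = 2

    def get_feeds(self):
        # Return (title, url) pairs; this URL is illustrative only
        return [('Front Page', 'http://example.com/rss.xml')]

    def print_version(self, url):
        # Hypothetical rewrite to the printer friendly version of an article
        return url + '?print=yes'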

View File

@@ -1,81 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import tempfile, shutil, os
from libprs500.ebooks.lrf.web import build_index, parse_feeds
RSS = 'http://economist.com/rss/'
TITLES = [
'The world this week',
'Letters',
'Briefings',
'Special reports',
'Britain',
'Europe',
'United States',
'The Americas',
'Middle East and Africa',
'Asia',
'International',
'Business',
'Finance and economics',
'Science and technology',
'Books and arts',
'Indicators'
]
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
def print_version(url):
return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
def get_feeds(browser):
src = browser.open(RSS).read()
soup = BeautifulSoup(src)
feeds = []
for ul in soup.findAll('ul'):
lis = ul.findAll('li')
try:
title, link = lis[0], lis[1]
except IndexError:
continue
title = title.string
if title:
title = title.strip()
if title not in TITLES:
continue
a = link.find('a')
feeds.append((title, a['href'].strip()))
return feeds
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
feeds = get_feeds(profile['browser'])
articles = parse_feeds(feeds, profile['browser'], print_version, max_articles_per_feed=20)
index = build_index('The Economist', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%d %b %Y]'
profile['max_recursions'] = 3
profile['title'] = 'The Economist'
profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -1,73 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''New York Times from RSS feeds.'''
import os, tempfile, shutil
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.lrf.web import build_index, parse_feeds
RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
LOGIN = 'http://www.nytimes.com/auth/login'
def get_feeds(browser):
src = browser.open(RSS).read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
'Dining & Wine', 'Home & Garden', 'Multimedia',
'Most E-mailed Articles',
'Automobiles', 'Fashion & Style', 'Television News',
'Education']:
feeds.append((link['title'], link['href']))
#else: print link['title']
return feeds
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = login(profile)
feeds = get_feeds(profile['browser'])
articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print',
oldest_article=2)
index = build_index('The New York Times', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%a, %d %b, %Y]'
profile['max_recursions'] = 2
profile['title'] = 'The New York Times'
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])
def login(profile):
br = browser()
if profile['username'] and profile['password']:
br.open(LOGIN)
br.select_form(name='login')
br['USERID'] = profile['username']
br['PASSWORD'] = profile['password']
br.submit()
return br
if __name__ == '__main__':
feeds = get_feeds()
articles = parse_feeds(feeds)
print articles

View File

@@ -1,136 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Profiles for known websites.'''
import re
from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize
from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize
from libprs500.ebooks.lrf.web.economist import initialize as economist_initialize
from libprs500.ebooks.lrf.web.economist import finalize as economist_finalize
profiles = {
'default' : {
'url' : '', # The URL of the website
'title' : '', # The title to use for the LRF file
'max_recursions' : 1, # Number of levels of links to follow
'max_files' : 1000, # Maximum number of files to download
'delay' : 0, # Delay between consecutive downloads
'timeout' : 10, # Timeout for fetching files from server in seconds
'timefmt' : ' [%a %d %b %Y]',
'no_stylesheets' : False, # Download stylesheets
'match_regexps' : [], # List of regular expressions that determines which links to follow
'filter_regexps' : [], # List of regular expressions that determines which links to ignore
# Only one of match_regexps or filter_regexps should be defined
'html2lrf_options' : [], # List of options to pass to html2lrf
'preprocess_regexps': [], # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
# See the profiles below for examples of these settings.
},
'nytimes' : {
'initialize' : nytimes_initialize,
'finalize' : nytimes_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove header bar
(r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
(r'<div class="articleTools">.*></ul>', lambda match : ''),
# Remove footer bar
(r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
(r'<div id="footer">.*', lambda match : '</body></html>'),
]
],
},
'bbc' : {
'initialize' : bbc_initialize,
'finalize' : bbc_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove footer from individual stories
(r'<div class=.footer.>.*?Published',
lambda match : '<p></p><div class="footer">Published'),
# Add some style info in place of disabled stylesheet
(r'<link.*?type=.text/css.*?>', lambda match :
'''<style type="text/css">
.headline {font-size: x-large;}
.fact { padding-top: 10pt }
</style>'''),
]
],
},
'newsweek' : {
'initialize' : newsweek_initialize,
'finalize' : newsweek_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Make fonts larger
(r'<style.*?\.copyright.*?</style>',
lambda match : \
'''<style type="text/css">'''
'''updateTime{font:small Arial;color:#000000;}'''
'''.credit{font:small Arial;color:#999999;}'''
'''.head{font:bold 18pt x-large;color:#CC0000;}'''
'''.abstract{font:14pt large Verdana;color:#000000;}'''
'''.title{font:bold;color:#000000;}'''
'''.source{font:bold small Verdana;color:#CC0000;}'''
'''.footerLink{font:bold Verdana;color:#000000;}'''
'''.caption{font: Verdana;color:#000000;}'''
'''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
'''.copyright{font-style:italic;}'''
'''</style>'''
),
]
],
},
'economist' : {
'initialize' : economist_initialize,
'finalize' : economist_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove advert
(r'<noscript.*?</noscript>', lambda match: ''),
]
],
},
}
for key in profiles.keys():
if key == 'default':
continue
newd = profiles['default'].copy()
newd.update(profiles[key])
profiles[key] = newd
def profile_to_command_line_options(profile):
args = []
args.append('--max-recursions='+str(profile['max_recursions']))
args.append('--delay='+str(profile['delay']))
for i in profile['match_regexps']:
args.append('--match-regexp="'+i+'"')
for i in profile['filter_regexps']:
args.append('--filter-regexp="'+i+'"')
return args

View File

@@ -0,0 +1,227 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
The DefaultProfile class that built-in and user website profiles subclass.
'''
import tempfile, time, calendar, re, operator
from htmlentitydefs import name2codepoint
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
class DefaultProfile(object):
url = '' # The URL of the website
title = 'Default Profile' # The title to use for the LRF file
max_articles_per_feed = 10 # Maximum number of articles to download from each feed
html_description = False # If True process the <description> element of the feed as HTML
oldest_article = 7 # How many days old should the oldest article downloaded from the feeds be?
max_recursions = 1 # Number of levels of links to follow
max_files = 3000 # Maximum number of files to download
delay = 0 # Delay between consecutive downloads
timeout = 10 # Timeout for fetching files from server in seconds
timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page
no_stylesheets = False # Download stylesheets only if False
match_regexps = [] # List of regular expressions that determines which links to follow
filter_regexps = [] # List of regular expressions that determines which links to ignore
# Only one of match_regexps or filter_regexps should be defined
html2lrf_options = [] # List of options to pass to html2lrf
# List of regexp substitution rules to run on the downloaded HTML. Each element of the
# list should be a two element tuple. The first element of the tuple should
# be a compiled regular expression and the second a callable that takes
# a single match object and returns a string to replace the match.
preprocess_regexps = []
# See the built-in profiles for examples of these settings.
def get_feeds(self):
'''
Return a list of RSS feeds to fetch for this profile. Each element of the list
must be a 2-element tuple of the form (title, url).
'''
raise NotImplementedError
@classmethod
def print_version(cls, url):
'''
Take a URL pointing to an article and return the URL pointing to the
print version of the article.
'''
return url
@classmethod
def get_browser(cls):
'''
Return a browser instance used to fetch documents from the web.
If your profile requires that you login first, override this method
in your subclass. See for example the nytimes profile.
'''
return browser()
########################################################################
###################### End of customizable portion #####################
########################################################################
def __init__(self, username=None, password=None):
self.username = username
self.password = password
self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
self.browser = self.get_browser()
self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
def __del__(self):
import os, shutil
if os.path.isdir(self.temp_dir):
shutil.rmtree(self.temp_dir)
def build_index(self):
'''Build an RSS based index.html'''
import os
articles = self.parse_feeds()
def build_sub_index(title, items):
ilist = ''
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
for item in items:
ilist += li%item
return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())
cnum = 0
clist = ''
categories = articles.keys()
categories.sort()
for category in categories:
cnum += 1
cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
prefix = 'file:' if iswindows else ''
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
src = build_sub_index(category, articles[category])
open(cfile, 'wb').write(src.encode('utf-8'))
src = '''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
categories=clist, title=self.title)
index = os.path.join(self.temp_dir, 'index.html')
open(index, 'wb').write(src.encode('utf-8'))
return index
def parse_feeds(self):
feeds = self.get_feeds()
articles = {}
for title, url in feeds:
try:
src = self.browser.open(url).read()
except Exception, err:
print 'Could not fetch feed: %s\nError: %s'%(url, err)
continue
articles[title] = []
soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'):
try:
pubdate = item.find('pubdate').string
if not pubdate:
continue
pubdate = pubdate.replace('+0000', 'GMT')
d = {
'title' : item.find('title').string,
'url' : self.print_version(item.find('guid').string),
'timestamp': calendar.timegm(self.strptime(pubdate)),
'date' : pubdate
}
delta = time.time() - d['timestamp']
if delta > self.oldest_article*3600*24:
continue
except Exception, err:
continue
try:
desc = item.find('description')
d['description'] = self.process_html_description(desc) if self.html_description else desc.string
except:
d['description'] = ''
articles[title].append(d)
articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
articles[title][self.max_articles_per_feed:] = []
for item in articles[title]:
item.pop('timestamp')
if not articles[title]:
articles.pop(title)
return articles
@classmethod
def process_html_description(cls, tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
July=7, August=8, September=9, October=10,
November=11, December=12)
@classmethod
def strptime(cls, src):
src = src.strip().split()
src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
try:
src[2] = str(cls.MONTH_MAP[src[2]])
except KeyError:
src[2] = str(cls.FULL_MONTH_MAP[src[2]])
return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
def command_line_options(self):
args = []
args.append('--max-recursions='+str(self.max_recursions))
args.append('--delay='+str(self.delay))
args.append('--max-files='+str(self.max_files))
for i in self.match_regexps:
args.append('--match-regexp="'+i+'"')
for i in self.filter_regexps:
args.append('--filter-regexp="'+i+'"')
return args
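
In rough terms, a profile does its work at construction time: __init__ creates a temporary directory, fetches and parses the feeds, writes index.html and exposes it as a file: URL that web2lrf then hands to the fetcher. A hedged usage sketch using the BBC profile defined below (requires network access):

from libprs500.ebooks.lrf.web.profiles.bbc import BBC

profile = BBC()                       # fetches the feeds and builds the index
print profile.url                     # file: URL of the generated index.html
print profile.command_line_options()  # e.g. ['--max-recursions=2', '--delay=0', ...]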

View File

@@ -0,0 +1,57 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch the BBC.
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class BBC(DefaultProfile):
title = 'The BBC'
max_recursions = 2
timefmt = ' [%a, %d %b, %Y]'
no_stylesheets = True
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove footer from individual stories
(r'<div class=.footer.>.*?Published',
lambda match : '<p></p><div class="footer">Published'),
# Add some style info in place of disabled stylesheet
(r'<link.*?type=.text/css.*?>', lambda match :
'''<style type="text/css">
.headline {font-size: x-large;}
.fact { padding-top: 10pt }
</style>'''),
]
]
def print_version(self, url):
return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
def get_feeds(self):
src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
ul = soup.find('ul', attrs={'class':'rss'})
for link in ul.findAll('a'):
feeds.append((link.string, link['href']))
return feeds
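
The preprocess_regexps pairs above are substitution rules run on each downloaded page before conversion (per the comment in DefaultProfile); a rough illustration on a made-up HTML fragment:

from libprs500.ebooks.lrf.web.profiles.bbc import BBC

html = ('<html><link rel="stylesheet" type="text/css" href="nol.css">'
        '<div class="footer">Most popular now ... Published 2007/10/27</div></html>')
for pattern, replacement in BBC.preprocess_regexps:
    html = pattern.sub(replacement, html)
# The <link> tag is now replaced by the inline <style> block, and everything in
# the footer before 'Published' has been dropped.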

View File

@@ -0,0 +1,81 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch The Economist.
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Economist(DefaultProfile):
title = 'The Economist'
timefmt = ' [%d %b %Y]'
max_recursions = 3
TITLES = [
'The world this week',
'Letters',
'Briefings',
'Special reports',
'Britain',
'Europe',
'United States',
'The Americas',
'Middle East and Africa',
'Asia',
'International',
'Business',
'Finance and economics',
'Science and technology',
'Books and arts',
'Indicators'
]
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove advert
(r'<noscript.*?</noscript>', lambda match: ''),
]
]
def __init__(self, username=None, password=None):
DefaultProfile.__init__(self, username, password)
self.browser = None # Needed as otherwise there are timeouts while fetching actual articles
def print_version(self, url):
return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
def get_feeds(self):
src = self.browser.open('http://economist.com/rss/').read()
soup = BeautifulSoup(src)
feeds = []
for ul in soup.findAll('ul'):
lis = ul.findAll('li')
try:
title, link = lis[0], lis[1]
except IndexError:
continue
title = title.string
if title:
title = title.strip()
if title not in self.__class__.TITLES:
continue
a = link.find('a')
feeds.append((title, a['href'].strip()))
return feeds

View File

@@ -12,14 +12,27 @@
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Logic to create a Newsweek HTML aggregator from RSS feeds'''
'''
Profile to download Newsweek
'''
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
import tempfile, os, shutil
from libprs500.ebooks.lrf.web import build_index, parse_feeds
from libprs500 import __appname__, iswindows, browser
RSS_FEEDS = [
class Newsweek(DefaultProfile):
title = 'Newsweek'
max_recursions = 2
timefmt = ' [%d %b %Y]'
html_description = True
oldest_article = 15
def print_version(self, url):
if not url.endswith('/'):
url += '/'
return url + 'output/print'
def get_feeds(self):
return [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
@@ -32,28 +45,5 @@ RSS_FEEDS = [
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
]
def print_version(url):
if '?' in url:
url = url[:url.index('?')]
if not url.endswith('/'):
url += '/'
return url + 'output/print'
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
max_articles_per_feed=20, oldest_article=15,
html_description=True)
index = build_index('Newsweek', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%d %b %Y]'
profile['max_recursions'] = 2
profile['title'] = 'Newsweek'
profile['url'] = 'file:'+ ('' if iswindows else '//') +index
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -0,0 +1,66 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Profile to download the New York Times
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class NYTimes(DefaultProfile):
title = 'The New York Times'
timefmt = ' [%a, %d %b, %Y]'
max_recursions = 2
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove header bar
(r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
(r'<div class="articleTools">.*></ul>', lambda match : ''),
# Remove footer bar
(r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
(r'<div id="footer">.*', lambda match : '</body></html>'),
]
]
def get_browser(self):
br = DefaultProfile.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
def get_feeds(self):
src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
'Dining & Wine', 'Home & Garden', 'Multimedia',
'Most E-mailed Articles',
'Automobiles', 'Fashion & Style', 'Television News',
'Education']:
feeds.append((link['title'], link['href']))
return feeds
def print_version(self, url):
return url + '?&pagewanted=print'
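
The login in get_browser only happens when credentials are supplied; web2lrf passes --username and --password straight to the profile constructor (see process_profile earlier). A sketch with placeholder credentials:

from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes

# Placeholder credentials; requires a valid NYTimes.com account and network access
profile = NYTimes(username='someone@example.com', password='secret')
print profile.url   # file: URL of an index built with the logged-in browser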