	Add support for user profiles in web2lrf
commit 8799a6f3f2
parent 275b59a2e7
| @@ -13,134 +13,3 @@ | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| 
 | ||||
| import os, time, calendar, operator, re | ||||
| 
 | ||||
| from libprs500 import iswindows | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup | ||||
| from htmlentitydefs import name2codepoint | ||||
| 
 | ||||
| DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6) | ||||
| MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12) | ||||
| FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,  | ||||
|                       July=7, August=8, September=9, October=10,  | ||||
|                       November=11, December=12) | ||||
| 
 | ||||
| def strptime(src): | ||||
|     src = src.strip().split() | ||||
|     src[0] = str(DAY_MAP[src[0][:-1]])+',' | ||||
|     try: | ||||
|         src[2] = str(MONTH_MAP[src[2]]) | ||||
|     except KeyError: | ||||
|         src[2] = str(FULL_MONTH_MAP[src[2]]) | ||||
|     return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z') | ||||
| 
 | ||||
| def process_html_description(tag): | ||||
|         src = '\n'.join(tag.contents) | ||||
|         replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] | ||||
|         for e in replaced_entities: | ||||
|             ent = '&'+e+';' | ||||
|             src = src.replace(ent, unichr(name2codepoint[e])) | ||||
|         return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src) | ||||
| 
 | ||||
| def parse_feeds(feeds, browser, print_version,  | ||||
|                 max_articles_per_feed=10,  | ||||
|                 html_description=False, | ||||
|                 oldest_article=7): | ||||
|     ''' | ||||
|     @param print_version: Callable that takes a url string and returns the url of the | ||||
|                           printable version of the article pointed to by the original url. | ||||
|     @param max_articles_per_feed: Maximum number of articles to download from each feed | ||||
|     @param html_description: If True the articles' descriptions are processed as HTML | ||||
|     @param oldest_article: A number in days. No articles older than now - oldest_article | ||||
|                            will be downloaded. | ||||
|     ''' | ||||
|     articles = {} | ||||
|     for title, url in feeds: | ||||
|         try: | ||||
|             src = browser.open(url).read() | ||||
|         except Exception, err: | ||||
|             print 'Could not fetch feed: %s\nError: %s'%(url, err) | ||||
|             continue | ||||
|          | ||||
|         articles[title] = [] | ||||
|         soup = BeautifulStoneSoup(src) | ||||
|         for item in soup.findAll('item'): | ||||
|             try: | ||||
|                 pubdate = item.find('pubdate').string | ||||
|                 if not pubdate: | ||||
|                     continue | ||||
|                 pubdate = pubdate.replace('+0000', 'GMT') | ||||
|                 d = {  | ||||
|                     'title'    : item.find('title').string,                  | ||||
|                     'url'      : print_version(item.find('guid').string), | ||||
|                     'timestamp': calendar.timegm(strptime(pubdate)), | ||||
|                     'date'     : pubdate | ||||
|                     } | ||||
|                 delta = time.time() - d['timestamp'] | ||||
|                 if delta > oldest_article*3600*24: | ||||
|                     continue | ||||
|                   | ||||
|             except Exception, err: | ||||
|                 continue | ||||
|             try: | ||||
|                 desc = item.find('description') | ||||
|                 d['description'] = process_html_description(desc) if  html_description else desc.string                     | ||||
|             except: | ||||
|                 d['description'] = '' | ||||
|             articles[title].append(d) | ||||
|         articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True) | ||||
|         articles[title][max_articles_per_feed:] = [] | ||||
|         for item in articles[title]: | ||||
|             item.pop('timestamp') | ||||
|         if not articles[title]: | ||||
|             articles.pop(title) | ||||
|     return articles | ||||
| 
 | ||||
| 
 | ||||
| def build_index(title, articles, dir): | ||||
|     '''Build an RSS based index.html''' | ||||
| 
 | ||||
|     def build_sub_index(title, items): | ||||
|         ilist = '' | ||||
|         li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\ | ||||
|             u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n' | ||||
|         for item in items: | ||||
|             ilist += li%item | ||||
|         return u'''\ | ||||
|         <html> | ||||
|         <body> | ||||
|         <h2>%(title)s</h2> | ||||
|         <ul> | ||||
|         %(items)s | ||||
|         </ul> | ||||
|         </body> | ||||
|         </html> | ||||
|         '''%dict(title=title, items=ilist.rstrip())         | ||||
|      | ||||
|     cnum = 0 | ||||
|     clist = '' | ||||
|     categories = articles.keys() | ||||
|     categories.sort() | ||||
|     for category in categories: | ||||
|         cnum  += 1 | ||||
|         cfile = os.path.join(dir, 'category'+str(cnum)+'.html') | ||||
|         prefix = 'file:' if iswindows else '' | ||||
|         clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category) | ||||
|         src = build_sub_index(category, articles[category]) | ||||
|         open(cfile, 'wb').write(src.encode('utf-8'))         | ||||
|      | ||||
|     src = '''\ | ||||
|     <html> | ||||
|     <body> | ||||
|     <h1>%(title)s</h1> | ||||
|     <div style='text-align: right; font-weight: bold'>%(date)s</div> | ||||
|     <ul> | ||||
|     %(categories)s | ||||
|     </ul> | ||||
|     </body> | ||||
|     </html> | ||||
|     '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),  | ||||
|              categories=clist, title=title) | ||||
|     index = os.path.join(dir, 'index.html') | ||||
|     open(index, 'wb').write(src.encode('utf-8')) | ||||
|     return index | ||||
|  | ||||
| @@ -1,53 +0,0 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| 
 | ||||
| 
 | ||||
| import tempfile, shutil, os | ||||
| from libprs500.ebooks.lrf.web import build_index, parse_feeds | ||||
| 
 | ||||
| RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm' | ||||
| 
 | ||||
| from libprs500 import __appname__, iswindows, browser | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||
| 
 | ||||
| 
 | ||||
| def get_feeds(browser): | ||||
|     src = browser.open(RSS).read() | ||||
|     soup = BeautifulSoup(src[src.index('<html'):]) | ||||
|     feeds = [] | ||||
|     ul =  soup.find('ul', attrs={'class':'rss'}) | ||||
|     for link in ul.findAll('a'): | ||||
|         feeds.append((link.string, link['href'])) | ||||
|     return feeds | ||||
| 
 | ||||
| def initialize(profile): | ||||
|     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') | ||||
|     profile['browser'] = browser() | ||||
|     feeds = get_feeds(profile['browser']) | ||||
|     articles = parse_feeds(feeds, profile['browser'], lambda x: x.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')) | ||||
|     index = build_index('The BBC', articles, profile['temp dir']) | ||||
|     profile['url'] = 'file:'+ ('' if iswindows else '//') + index | ||||
|     profile['timefmt'] = ' [%a, %d %b, %Y]' | ||||
|     profile['max_recursions'] =  2                  | ||||
|     profile['title']          = 'The BBC' | ||||
|     profile['no_stylesheets'] = True | ||||
|      | ||||
| def finalize(profile): | ||||
|     if os.path.isdir(profile['temp dir']): | ||||
|         shutil.rmtree(profile['temp dir']) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|      | ||||
| @@ -14,43 +14,48 @@ | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| '''Convert known websites into LRF files.''' | ||||
| 
 | ||||
| -import sys, time, tempfile, shutil, os, logging | ||||
| +import sys, time, tempfile, shutil, os, logging, imp, inspect | ||||
| from urlparse import urlsplit | ||||
| 
 | ||||
| from libprs500 import __appname__, setup_cli_handlers, CommandLineError | ||||
| from libprs500.ebooks.lrf import option_parser as lrf_option_parser | ||||
| from libprs500.ebooks.lrf.html.convert_from import process_file | ||||
| -from libprs500.ebooks.lrf.web.profiles import profiles | ||||
| 
 | ||||
| from libprs500.web.fetch.simple import create_fetcher | ||||
| 
 | ||||
| -available_profiles = profiles.keys() | ||||
| -available_profiles.remove('default') | ||||
| -available_profiles = ' '.join(available_profiles) | ||||
| +from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||
| +from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes | ||||
| +from libprs500.ebooks.lrf.web.profiles.bbc import BBC | ||||
| +from libprs500.ebooks.lrf.web.profiles.newsweek import Newsweek | ||||
| 
 | ||||
| +builtin_profiles   = [NYTimes, BBC, Newsweek] | ||||
| +available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] | ||||
| 
 | ||||
| def option_parser(): | ||||
|     parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n''' | ||||
|                           '''%prog downloads a site from the web and converts it ''' | ||||
|                           '''into a LRF file for use with the SONY Reader. ''' | ||||
| -                          '''website_profile is one of '''+available_profiles+\ | ||||
| +                          '''website_profile is one of '''+str(available_profiles)+\ | ||||
|                           ''' If you specify a website_profile of default or do not specify ''' | ||||
|                           '''it, you must specify the --url option.''' | ||||
|                           ) | ||||
|      | ||||
|     parser.add_option('-u', '--url', dest='url', default=None,   | ||||
|                       help='The URL to download. You only need to specify this if you are not specifying a website_profile.') | ||||
|      | ||||
| +    parser.add_option('--user-profile', default=None, | ||||
| +                      help='Path to a python file containing a user created profile.') | ||||
|     parser.add_option('--username', dest='username', default=None,  | ||||
|                       help='Specify the username to be used while downloading. Only used if the profile supports it.') | ||||
|     parser.add_option('--password', dest='password', default=None, | ||||
|                       help='Specify the password to be used while downloading. Only used if the profile supports it.') | ||||
| -    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %(timeout)s s'%profiles['default'], | ||||
| +    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout, | ||||
|                       default=None, type='int', dest='timeout') | ||||
| -    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %(max_recursions)s'%profiles['default'], | ||||
| +    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions, | ||||
|                       default=None, type='int', dest='max_recursions') | ||||
|     parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files', | ||||
| -                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %(max_files)s'%profiles['default']) | ||||
| +                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files) | ||||
|     parser.add_option('--delay', default=None, dest='delay', type='int', | ||||
| -                      help='Minimum interval in seconds between consecutive fetches. Default is %(delay)s s'%profiles['default']) | ||||
| +                      help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay) | ||||
|     parser.add_option('--dont-download-stylesheets', action='store_true', default=None, | ||||
|                       help='Do not download CSS stylesheets.', dest='no_stylesheets')     | ||||
|      | ||||
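
With the options above, a typical invocation using a custom profile might look like the following; the profile path is a hypothetical example.

    web2lrf --user-profile ./my_profile.py --max-recursions 2 --delay 1
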
| @@ -85,45 +90,58 @@ def process_profile(args, options, logger=None): | ||||
|             level = logging.DEBUG if options.verbose else logging.INFO | ||||
|             logger = logging.getLogger('web2lrf') | ||||
|             setup_cli_handlers(logger, level) | ||||
| +        index = -1 | ||||
| +        if options.user_profile is not None: | ||||
| +            path = os.path.abspath(options.user_profile) | ||||
| +            name = os.path.splitext(os.path.basename(path))[0] | ||||
| +            res = imp.find_module(name, [os.path.dirname(path)]) | ||||
| +            module = imp.load_module(name, *res) | ||||
| +            classes = inspect.getmembers(module, | ||||
| +                lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\ | ||||
| +                           and x is not DefaultProfile) | ||||
| +            if not classes: | ||||
| +                raise CommandLineError('Invalid user profile '+path) | ||||
| +            builtin_profiles.append(classes[0][1]) | ||||
| +            available_profiles.append(name) | ||||
| +            if len(args) < 2: | ||||
| +                args.append('') | ||||
| +            args[1] = name | ||||
|         if len(args) == 2: | ||||
| -            if not profiles.has_key(args[1]): | ||||
| -                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys())) | ||||
| -        profile = profiles[args[1]] if len(args) == 2 else profiles['default'] | ||||
| -        profile['username'] = options.username | ||||
| -        profile['password'] = options.password | ||||
| -        if profile.has_key('initialize'): | ||||
| -            profile['initialize'](profile) | ||||
| -        if profile.has_key('browser'): | ||||
| -            options.browser = profile['browser'] | ||||
| +            try: | ||||
| +                index = available_profiles.index(args[1]) | ||||
| +            except ValueError: | ||||
| +                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles)) | ||||
| +        profile = DefaultProfile if index == -1 else builtin_profiles[index] | ||||
| +        profile = profile(options.username, options.password) | ||||
| +        if profile.browser is not None: | ||||
| +            options.browser = profile.browser | ||||
|          | ||||
|         for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'): | ||||
|             val = getattr(options, opt) | ||||
|             if val is None: | ||||
| -                setattr(options, opt, profile[opt]) | ||||
| +                setattr(options, opt, getattr(profile, opt)) | ||||
|          | ||||
|         if not options.url: | ||||
| -            options.url = profile['url'] | ||||
| +            options.url = profile.url | ||||
|          | ||||
|         if not options.url: | ||||
|             raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,)) | ||||
|          | ||||
|         if not options.title: | ||||
| -            title = profile['title'] | ||||
| +            title = profile.title | ||||
|             if not title: | ||||
|                 title = urlsplit(options.url).netloc | ||||
| -            options.title = title + time.strftime(profile['timefmt'], time.localtime()) | ||||
| +            options.title = title + time.strftime(profile.timefmt, time.localtime()) | ||||
|          | ||||
| -        options.match_regexps += profile['match_regexps'] | ||||
| -        options.preprocess_regexps = profile['preprocess_regexps'] | ||||
| -        options.filter_regexps += profile['filter_regexps'] | ||||
| +        options.match_regexps += profile.match_regexps | ||||
| +        options.preprocess_regexps = profile.preprocess_regexps | ||||
| +        options.filter_regexps += profile.filter_regexps | ||||
|         if len(args) == 2 and args[1] != 'default': | ||||
|             options.anchor_ids = False | ||||
|          | ||||
|         htmlfile, tdir = fetch_website(options, logger) | ||||
|         create_lrf(htmlfile, options, logger) | ||||
|     finally: | ||||
| -        if profile.has_key('finalize'): | ||||
| -            profile['finalize'](profile) | ||||
|         if tdir and os.path.isdir(tdir): | ||||
|             shutil.rmtree(tdir) | ||||
|      | ||||
|  | ||||
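
A minimal user profile file that the loader above would accept might look like the following sketch; the file name, class name, and feed URL are hypothetical.

    # my_profile.py -- loaded via the --user-profile option
    from libprs500.ebooks.lrf.web.profiles import DefaultProfile

    class MySite(DefaultProfile):
        title = 'My Site'

        def get_feeds(self):
            # get_feeds() must return a list of (title, url) 2-tuples
            return [('Headlines', 'http://example.com/rss.xml')]
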
| @@ -1,81 +0,0 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| 
 | ||||
| 
 | ||||
| import tempfile, shutil, os | ||||
| from libprs500.ebooks.lrf.web import build_index, parse_feeds | ||||
| 
 | ||||
| RSS = 'http://economist.com/rss/' | ||||
| TITLES = [ | ||||
|           'The world this week', | ||||
|           'Letters', | ||||
|           'Briefings', | ||||
|           'Special reports', | ||||
|           'Britain', | ||||
|           'Europe', | ||||
|           'United States', | ||||
|           'The Americas', | ||||
|           'Middle East and Africa', | ||||
|           'Asia', | ||||
|           'International', | ||||
|           'Business', | ||||
|           'Finance and economics', | ||||
|           'Science and technology', | ||||
|           'Books and arts', | ||||
|           'Indicators' | ||||
|           ] | ||||
| 
 | ||||
| from libprs500 import __appname__, iswindows, browser | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||
| 
 | ||||
| def print_version(url): | ||||
|     return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '') | ||||
| 
 | ||||
| def get_feeds(browser): | ||||
|     src = browser.open(RSS).read() | ||||
|     soup = BeautifulSoup(src) | ||||
|     feeds = [] | ||||
|     for ul in soup.findAll('ul'): | ||||
|         lis =  ul.findAll('li') | ||||
|         try: | ||||
|             title, link = lis[0], lis[1] | ||||
|         except IndexError: | ||||
|             continue | ||||
|         title = title.string | ||||
|         if title: | ||||
|             title = title.strip() | ||||
|         if title not in TITLES: | ||||
|             continue | ||||
|         a = link.find('a') | ||||
|         feeds.append((title, a['href'].strip())) | ||||
|          | ||||
|     return feeds | ||||
|              | ||||
| def initialize(profile): | ||||
|     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') | ||||
|     profile['browser'] = browser() | ||||
|     feeds = get_feeds(profile['browser']) | ||||
|     articles = parse_feeds(feeds, profile['browser'], print_version, max_articles_per_feed=20) | ||||
|     index = build_index('The Economist', articles, profile['temp dir']) | ||||
|     profile['url'] = 'file:'+ ('' if iswindows else '//') + index | ||||
|     profile['timefmt'] = ' [%d %b %Y]' | ||||
|     profile['max_recursions'] =  3                 | ||||
|     profile['title']          = 'The Economist' | ||||
|     profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts | ||||
|          | ||||
| def finalize(profile): | ||||
|     if os.path.isdir(profile['temp dir']): | ||||
|         shutil.rmtree(profile['temp dir']) | ||||
|      | ||||
| @@ -1,73 +0,0 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| '''New York Times from RSS feeds.''' | ||||
| import os, tempfile, shutil | ||||
| 
 | ||||
| from libprs500 import __appname__, iswindows, browser | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||
| from libprs500.ebooks.lrf.web import build_index, parse_feeds | ||||
| 
 | ||||
| RSS = 'http://www.nytimes.com/services/xml/rss/index.html' | ||||
| LOGIN = 'http://www.nytimes.com/auth/login' | ||||
| 
 | ||||
| def get_feeds(browser): | ||||
|     src = browser.open(RSS).read() | ||||
|     soup = BeautifulSoup(src[src.index('<html'):]) | ||||
|     feeds = [] | ||||
|     for link in soup.findAll('link', attrs={'type':'application/rss+xml'}): | ||||
|         if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',  | ||||
|                                  'Dining & Wine', 'Home & Garden', 'Multimedia', | ||||
|                                  'Most E-mailed Articles',  | ||||
|                                  'Automobiles', 'Fashion & Style', 'Television News', | ||||
|                                  'Education']: | ||||
|             feeds.append((link['title'], link['href'])) | ||||
|         #else: print link['title'] | ||||
|      | ||||
|     return feeds | ||||
| 
 | ||||
| def initialize(profile): | ||||
|     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') | ||||
|     profile['browser'] = login(profile) | ||||
|     feeds = get_feeds(profile['browser']) | ||||
|     articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print', | ||||
|                            oldest_article=2) | ||||
|     index = build_index('The New York Times', articles, profile['temp dir']) | ||||
|     profile['url'] = 'file:'+ ('' if iswindows else '//') + index | ||||
|     profile['timefmt'] = ' [%a, %d %b, %Y]' | ||||
|     profile['max_recursions'] =  2                  | ||||
|     profile['title']          = 'The New York Times' | ||||
|      | ||||
|      | ||||
| def finalize(profile): | ||||
|     if os.path.isdir(profile['temp dir']): | ||||
|         shutil.rmtree(profile['temp dir']) | ||||
|   | ||||
| 
 | ||||
| def login(profile): | ||||
|     br = browser() | ||||
|     if profile['username'] and profile['password']: | ||||
|         br.open(LOGIN) | ||||
|         br.select_form(name='login') | ||||
|         br['USERID']   = profile['username'] | ||||
|         br['PASSWORD'] = profile['password'] | ||||
|         br.submit() | ||||
|     return br | ||||
|        | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     feeds = get_feeds() | ||||
|     articles = parse_feeds(feeds) | ||||
|     print articles | ||||
| 
 | ||||
| @@ -1,136 +0,0 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| '''Profiles for known websites.''' | ||||
| import re | ||||
| 
 | ||||
| from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize | ||||
| from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize | ||||
| from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize | ||||
| from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize | ||||
| from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize | ||||
| from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize | ||||
| from libprs500.ebooks.lrf.web.economist import initialize as economist_initialize | ||||
| from libprs500.ebooks.lrf.web.economist import finalize as economist_finalize | ||||
| 
 | ||||
| 
 | ||||
| profiles = { | ||||
|             'default' : { | ||||
|                          'url'               : '',    # The URL of the website | ||||
|                          'title'             : '',    # The title to use for the LRF file | ||||
|                          'max_recursions'    : 1,     # Number of levels of links to follow | ||||
|                          'max_files'         : 1000,  # Maximum number of files to download | ||||
|                          'delay'             : 0,     # Delay between consecutive downloads | ||||
|                          'timeout'           : 10,    # Timeout for fetching files from server in seconds | ||||
|                          'timefmt'           : ' [%a %d %b %Y]', | ||||
|                          'no_stylesheets'    : False, # Download stylesheets  | ||||
|                          'match_regexps'     : [],    # List of regular expressions that determines which links to follow | ||||
|                          'filter_regexps'    : [],    # List of regular expressions that determines which links to ignore | ||||
|                          # Only one of match_regexps or filter_regexps should be defined | ||||
|                          'html2lrf_options'  : [],    # List of options to pass to html2lrf | ||||
|                          'preprocess_regexps': [],    # List of regexp substitution rules to run on the downloaded HTML before running html2lrf | ||||
|                          # See the profiles below for examples of these settings.  | ||||
|                        }, | ||||
|                         | ||||
|             'nytimes' : { | ||||
|                          'initialize'          : nytimes_initialize, | ||||
|                          'finalize'            : nytimes_finalize, | ||||
|                           | ||||
|                          'preprocess_regexps' : | ||||
|                          [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||
|                           [ | ||||
|                            # Remove header bar | ||||
|                            (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'), | ||||
|                            (r'<div class="articleTools">.*></ul>', lambda match : ''), | ||||
|                            # Remove footer bar | ||||
|                            (r'<\!--  end \#article -->.*', lambda match : '</body></html>'), | ||||
|                            (r'<div id="footer">.*', lambda match : '</body></html>'), | ||||
|                            ] | ||||
|                           ], | ||||
|                          }, | ||||
|                           | ||||
|             'bbc'     : { | ||||
|                           'initialize'          : bbc_initialize, | ||||
|                           'finalize'            : bbc_finalize, | ||||
|                           'preprocess_regexps' : | ||||
|                          [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||
|                           [ | ||||
|                            # Remove footer from individual stories | ||||
|                            (r'<div class=.footer.>.*?Published',  | ||||
|                             lambda match : '<p></p><div class="footer">Published'), | ||||
|                            # Add some style info in place of disabled stylesheet | ||||
|                            (r'<link.*?type=.text/css.*?>', lambda match : | ||||
|                             '''<style type="text/css"> | ||||
|                                 .headline {font-size: x-large;} | ||||
|                                 .fact { padding-top: 10pt  } | ||||
|                                 </style>'''), | ||||
|                            ] | ||||
|                           ], | ||||
|                           }, | ||||
|              | ||||
|             'newsweek' : { | ||||
|                           'initialize'          : newsweek_initialize, | ||||
|                           'finalize'            : newsweek_finalize, | ||||
|                           'preprocess_regexps'  : | ||||
|                          [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||
|                           [ | ||||
|                            # Make fonts larger | ||||
|                            (r'<style.*?\.copyright.*?</style>',  | ||||
|                             lambda match : \ | ||||
|                         '''<style type="text/css">''' | ||||
|                         '''updateTime{font:small Arial;color:#000000;}''' | ||||
|                         '''.credit{font:small Arial;color:#999999;}''' | ||||
|                         '''.head{font:bold 18pt x-large;color:#CC0000;}''' | ||||
|                         '''.abstract{font:14pt large Verdana;color:#000000;}''' | ||||
|                         '''.title{font:bold;color:#000000;}''' | ||||
|                         '''.source{font:bold small Verdana;color:#CC0000;}''' | ||||
|                         '''.footerLink{font:bold Verdana;color:#000000;}''' | ||||
|                         '''.caption{font: Verdana;color:#000000;}''' | ||||
|                         '''.textBodyBlack, .copyright{font: Verdana;color:#000000;}''' | ||||
|                         '''.copyright{font-style:italic;}''' | ||||
|                         '''</style>''' | ||||
|                             ), | ||||
|                            ] | ||||
|                           ], | ||||
|                           },  | ||||
|                            | ||||
|             'economist' : { | ||||
|                            'initialize'          : economist_initialize, | ||||
|                            'finalize'            : economist_finalize, | ||||
|                            'preprocess_regexps' : | ||||
|                            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||
|                             [ | ||||
|                              # Remove advert | ||||
|                              (r'<noscript.*?</noscript>', lambda match: ''), | ||||
|                              ] | ||||
|                             ],  | ||||
|                            },                                    | ||||
|             } | ||||
| 
 | ||||
| for key in profiles.keys(): | ||||
|     if key == 'default': | ||||
|         continue | ||||
|     newd = profiles['default'].copy() | ||||
|     newd.update(profiles[key]) | ||||
|     profiles[key] = newd | ||||
| 
 | ||||
| def profile_to_command_line_options(profile): | ||||
|     args = [] | ||||
|     args.append('--max-recursions='+str(profile['max_recursions'])) | ||||
|     args.append('--delay='+str(profile['delay'])) | ||||
|     for i in profile['match_regexps']: | ||||
|         args.append('--match-regexp="'+i+'"') | ||||
|     for i in profile['filter_regexps']: | ||||
|         args.append('--filter-regexp="'+i+'"') | ||||
|     return args | ||||
src/libprs500/ebooks/lrf/web/profiles/__init__.py (new file, 227 lines)
| @@ -0,0 +1,227 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| ''' | ||||
| ''' | ||||
| 
 | ||||
| import tempfile, time, calendar, re, operator | ||||
| from htmlentitydefs import name2codepoint | ||||
| 
 | ||||
| from libprs500 import __appname__, iswindows, browser | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup | ||||
| 
 | ||||
| 
 | ||||
| class DefaultProfile(object): | ||||
|      | ||||
|     url                   = ''    # The URL of the website | ||||
|     title                 = 'Default Profile'    # The title to use for the LRF file | ||||
|     max_articles_per_feed = 10    # Maximum number of articles to download from each feed  | ||||
|     html_description      = False # If True process the <description> element of the feed as HTML | ||||
|     oldest_article        = 7     # How many days old should the oldest article downloaded from the feeds be? | ||||
|     max_recursions        = 1     # Number of levels of links to follow | ||||
|     max_files             = 3000  # Maximum number of files to download | ||||
|     delay                 = 0     # Delay between consecutive downloads | ||||
|     timeout               = 10    # Timeout for fetching files from server in seconds | ||||
|     timefmt               = ' [%a %d %b %Y]' # The format of the date shown on the first page | ||||
|     no_stylesheets        = False # If True, do not download stylesheets | ||||
|     match_regexps         = []    # List of regular expressions that determines which links to follow | ||||
|     filter_regexps        = []    # List of regular expressions that determines which links to ignore | ||||
|     # Only one of match_regexps or filter_regexps should be defined | ||||
|      | ||||
|     html2lrf_options   = []    # List of options to pass to html2lrf | ||||
|     # List of regexp substitution rules to run on the downloaded HTML. Each element of the  | ||||
|     # list should be a two element tuple. The first element of the tuple should | ||||
|     # be a compiled regular expression and the second a callable that takes | ||||
|     # a single match object and returns a string to replace the match. | ||||
|     preprocess_regexps = [] | ||||
|      | ||||
|     # See the built-in profiles for examples of these settings. | ||||
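
As a hypothetical illustration of the format described above, a preprocess_regexps entry that strips HTML comments could be written as (re.compile(r'<!--.*?-->', re.DOTALL), lambda match: '').
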
|      | ||||
|     def get_feeds(self): | ||||
|         ''' | ||||
|         Return a list of RSS feeds to fetch for this profile. Each element of the list | ||||
|         must be a 2-element tuple of the form (title, url). | ||||
|         ''' | ||||
|         raise NotImplementedError | ||||
|      | ||||
|     @classmethod | ||||
|     def print_version(cls, url): | ||||
|         ''' | ||||
|         Takes a URL pointing to an article and returns the URL pointing to the | ||||
|         print version of the article. | ||||
|         ''' | ||||
|         return url | ||||
|      | ||||
|     @classmethod | ||||
|     def get_browser(cls): | ||||
|         ''' | ||||
|         Return a browser instance used to fetch documents from the web. | ||||
|          | ||||
|         If your profile requires that you login first, override this method | ||||
|         in your subclass. See for example the nytimes profile. | ||||
|         ''' | ||||
|         return browser() | ||||
|      | ||||
|     ######################################################################## | ||||
|     ###################### End of customizable portion ##################### | ||||
|     ######################################################################## | ||||
|      | ||||
|      | ||||
|     def __init__(self, username=None, password=None): | ||||
|         self.username = username | ||||
|         self.password = password | ||||
|         self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_') | ||||
|         self.browser = self.get_browser() | ||||
|         self.url = 'file:'+ ('' if iswindows else '//') + self.build_index() | ||||
|      | ||||
|     def __del__(self): | ||||
|         import os, shutil | ||||
|         if os.path.isdir(self.temp_dir): | ||||
|             shutil.rmtree(self.temp_dir) | ||||
|      | ||||
|     def build_index(self): | ||||
|         '''Build an RSS based index.html''' | ||||
|         import os | ||||
|         articles = self.parse_feeds() | ||||
|          | ||||
|      | ||||
|         def build_sub_index(title, items): | ||||
|             ilist = '' | ||||
|             li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\ | ||||
|                 u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n' | ||||
|             for item in items: | ||||
|                 ilist += li%item | ||||
|             return u'''\ | ||||
|             <html> | ||||
|             <body> | ||||
|             <h2>%(title)s</h2> | ||||
|             <ul> | ||||
|             %(items)s | ||||
|             </ul> | ||||
|             </body> | ||||
|             </html> | ||||
|             '''%dict(title=title, items=ilist.rstrip())         | ||||
|          | ||||
|         cnum = 0 | ||||
|         clist = '' | ||||
|         categories = articles.keys() | ||||
|         categories.sort() | ||||
|         for category in categories: | ||||
|             cnum  += 1 | ||||
|             cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html') | ||||
|             prefix = 'file:' if iswindows else '' | ||||
|             clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category) | ||||
|             src = build_sub_index(category, articles[category]) | ||||
|             open(cfile, 'wb').write(src.encode('utf-8'))         | ||||
|          | ||||
|         src = '''\ | ||||
|         <html> | ||||
|         <body> | ||||
|         <h1>%(title)s</h1> | ||||
|         <div style='text-align: right; font-weight: bold'>%(date)s</div> | ||||
|         <ul> | ||||
|         %(categories)s | ||||
|         </ul> | ||||
|         </body> | ||||
|         </html> | ||||
|         '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),  | ||||
|                  categories=clist, title=self.title) | ||||
|         index = os.path.join(self.temp_dir, 'index.html') | ||||
|         open(index, 'wb').write(src.encode('utf-8')) | ||||
|         return index | ||||
| 
 | ||||
|      | ||||
|     def parse_feeds(self): | ||||
|         feeds = self.get_feeds() | ||||
|         articles = {} | ||||
|         for title, url in feeds: | ||||
|             try: | ||||
|                 src = self.browser.open(url).read() | ||||
|             except Exception, err: | ||||
|                 print 'Could not fetch feed: %s\nError: %s'%(url, err) | ||||
|                 continue | ||||
|              | ||||
|             articles[title] = [] | ||||
|             soup = BeautifulStoneSoup(src) | ||||
|             for item in soup.findAll('item'): | ||||
|                 try: | ||||
|                     pubdate = item.find('pubdate').string | ||||
|                     if not pubdate: | ||||
|                         continue | ||||
|                     pubdate = pubdate.replace('+0000', 'GMT') | ||||
|                     d = {  | ||||
|                         'title'    : item.find('title').string,                  | ||||
|                         'url'      : self.print_version(item.find('guid').string), | ||||
|                         'timestamp': calendar.timegm(self.strptime(pubdate)), | ||||
|                         'date'     : pubdate | ||||
|                         } | ||||
|                     delta = time.time() - d['timestamp'] | ||||
|                     if delta > self.oldest_article*3600*24: | ||||
|                         continue | ||||
|                       | ||||
|                 except Exception, err: | ||||
|                     continue | ||||
|                 try: | ||||
|                     desc = item.find('description') | ||||
|                     d['description'] = self.process_html_description(desc) if  self.html_description else desc.string                     | ||||
|                 except: | ||||
|                     d['description'] = '' | ||||
|                 articles[title].append(d) | ||||
|             articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True) | ||||
|             articles[title][self.max_articles_per_feed:] = [] | ||||
|             for item in articles[title]: | ||||
|                 item.pop('timestamp') | ||||
|             if not articles[title]: | ||||
|                 articles.pop(title) | ||||
|         return articles | ||||
| 
 | ||||
|      | ||||
|     @classmethod | ||||
|     def process_html_description(cls, tag): | ||||
|         src = '\n'.join(tag.contents) | ||||
|         replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] | ||||
|         for e in replaced_entities: | ||||
|             ent = '&'+e+';' | ||||
|             src = src.replace(ent, unichr(name2codepoint[e])) | ||||
|         return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src) | ||||
| 
 | ||||
|      | ||||
|     DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6) | ||||
|     MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12) | ||||
|     FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,  | ||||
|                       July=7, August=8, September=9, October=10,  | ||||
|                       November=11, December=12) | ||||
|          | ||||
|     @classmethod | ||||
|     def strptime(cls, src): | ||||
|         src = src.strip().split() | ||||
|         src[0] = str(cls.DAY_MAP[src[0][:-1]])+',' | ||||
|         try: | ||||
|             src[2] = str(cls.MONTH_MAP[src[2]]) | ||||
|         except KeyError: | ||||
|             src[2] = str(cls.FULL_MONTH_MAP[src[2]]) | ||||
|         return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z') | ||||
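
A worked example of the rewriting strptime performs, assuming a typical RSS pubDate: 'Sun, 29 Apr 2007 12:00:00 GMT' becomes '0, 29 4 2007 12:00:00 GMT' (DAY_MAP['Sun'] == 0, MONTH_MAP['Apr'] == 4) before being parsed with the format '%w, %d %m %Y %H:%M:%S %Z'.
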
|      | ||||
|     def command_line_options(self): | ||||
|         args = [] | ||||
|         args.append('--max-recursions='+str(self.max_recursions)) | ||||
|         args.append('--delay='+str(self.delay)) | ||||
|         args.append('--max-files='+str(self.max_files)) | ||||
|         for i in self.match_regexps: | ||||
|             args.append('--match-regexp="'+i+'"') | ||||
|         for i in self.filter_regexps: | ||||
|             args.append('--filter-regexp="'+i+'"') | ||||
|         return args | ||||
|          | ||||
|      | ||||
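
Note that with the class defaults above, command_line_options() on a plain DefaultProfile instance would yield ['--max-recursions=1', '--delay=0', '--max-files=3000'].
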
src/libprs500/ebooks/lrf/web/profiles/bbc.py (new file, 57 lines)
| @@ -0,0 +1,57 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| ''' | ||||
| Fetch the BBC. | ||||
| ''' | ||||
| import re | ||||
| 
 | ||||
| from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||
| 
 | ||||
| class BBC(DefaultProfile): | ||||
|      | ||||
|     title = 'The BBC' | ||||
|     max_recursions = 2 | ||||
|     timefmt  = ' [%a, %d %b, %Y]' | ||||
|     no_stylesheets = True | ||||
|      | ||||
|     preprocess_regexps = \ | ||||
|         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||
|               [ | ||||
|                # Remove footer from individual stories | ||||
|                (r'<div class=.footer.>.*?Published',  | ||||
|                 lambda match : '<p></p><div class="footer">Published'), | ||||
|                # Add some style info in place of disabled stylesheet | ||||
|                (r'<link.*?type=.text/css.*?>', lambda match : | ||||
|                 '''<style type="text/css"> | ||||
|                     .headline {font-size: x-large;} | ||||
|                     .fact { padding-top: 10pt  } | ||||
|                     </style>'''), | ||||
|                ] | ||||
|                   ] | ||||
|      | ||||
|          | ||||
|     def print_version(self, url): | ||||
|         return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/') | ||||
|      | ||||
|     def get_feeds(self): | ||||
|         src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read() | ||||
|         soup = BeautifulSoup(src[src.index('<html'):]) | ||||
|         feeds = [] | ||||
|         ul =  soup.find('ul', attrs={'class':'rss'}) | ||||
|         for link in ul.findAll('a'): | ||||
|             feeds.append((link.string, link['href'])) | ||||
|         return feeds | ||||
| 
 | ||||
src/libprs500/ebooks/lrf/web/profiles/economist.py (new file, 81 lines)
| @@ -0,0 +1,81 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| ''' | ||||
| Fetch The Economist. | ||||
| ''' | ||||
| import re | ||||
| 
 | ||||
| from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||
| 
 | ||||
| class Economist(DefaultProfile): | ||||
|      | ||||
|     title = 'The Economist' | ||||
|     timefmt = ' [%d %b %Y]' | ||||
|     max_recursions = 3 | ||||
|      | ||||
|     TITLES = [ | ||||
|           'The world this week', | ||||
|           'Letters', | ||||
|           'Briefings', | ||||
|           'Special reports', | ||||
|           'Britain', | ||||
|           'Europe', | ||||
|           'United States', | ||||
|           'The Americas', | ||||
|           'Middle East and Africa', | ||||
|           'Asia', | ||||
|           'International', | ||||
|           'Business', | ||||
|           'Finance and economics', | ||||
|           'Science and technology', | ||||
|           'Books and arts', | ||||
|           'Indicators' | ||||
|           ] | ||||
|      | ||||
|     preprocess_regexps = \ | ||||
|         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||
|             [ | ||||
|              # Remove advert | ||||
|              (r'<noscript.*?</noscript>', lambda match: ''), | ||||
|              ] | ||||
|             ] | ||||
|      | ||||
|     def __init__(self, username=None, password=None): | ||||
|         DefaultProfile.__init__(self, username, password) | ||||
|         self.browser = None # Needed as otherwise there are timeouts while fetching actual articles | ||||
|      | ||||
|     def print_version(self, url): | ||||
|         return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '') | ||||
|      | ||||
|     def get_feeds(self): | ||||
|         src = self.browser.open('http://economist.com/rss/').read() | ||||
|         soup = BeautifulSoup(src) | ||||
|         feeds = [] | ||||
|         for ul in soup.findAll('ul'): | ||||
|             lis =  ul.findAll('li') | ||||
|             try: | ||||
|                 title, link = lis[0], lis[1] | ||||
|             except IndexError: | ||||
|                 continue | ||||
|             title = title.string | ||||
|             if title: | ||||
|                 title = title.strip() | ||||
|             if title not in self.__class__.TITLES: | ||||
|                 continue | ||||
|             a = link.find('a') | ||||
|             feeds.append((title, a['href'].strip())) | ||||
|              | ||||
|         return feeds | ||||
| @@ -12,14 +12,27 @@ | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| -'''Logic to create a Newsweek HTML aggregator from RSS feeds''' | ||||
| +''' | ||||
| +Profile to download Newsweek | ||||
| +''' | ||||
| +from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||
| 
 | ||||
| -import tempfile, os, shutil | ||||
| 
 | ||||
| -from libprs500.ebooks.lrf.web import build_index, parse_feeds | ||||
| -from libprs500 import __appname__, iswindows, browser | ||||
| 
 | ||||
| -RSS_FEEDS = [ | ||||
| +class Newsweek(DefaultProfile): | ||||
| + | ||||
| +    title = 'Newsweek' | ||||
| +    max_recursions = 2 | ||||
| +    timefmt  = ' [%d %b %Y]' | ||||
| +    html_description = True | ||||
| +    oldest_article = 15 | ||||
| + | ||||
| +    def print_version(self, url): | ||||
| +        if not url.endswith('/'): | ||||
| +            url += '/' | ||||
| +        return url + 'output/print' | ||||
| + | ||||
| +    def get_feeds(self): | ||||
| +        return [ | ||||
|              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',), | ||||
|              ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'), | ||||
|              ('Politics', 'http://feeds.newsweek.com/headlines/politics'), | ||||
| @@ -32,28 +45,5 @@ RSS_FEEDS = [ | ||||
|              ('Society', 'http://feeds.newsweek.com/newsweek/society'), | ||||
|              ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'), | ||||
|              ] | ||||
| 
 | ||||
| 
 | ||||
| -def print_version(url): | ||||
| -    if '?' in url: | ||||
| -        url = url[:url.index('?')] | ||||
| -    if not url.endswith('/'): | ||||
| -        url += '/' | ||||
| -    return url + 'output/print' | ||||
| - | ||||
| -def initialize(profile): | ||||
| -    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') | ||||
| -    profile['browser'] = browser() | ||||
| -    articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,  | ||||
| -                           max_articles_per_feed=20, oldest_article=15,  | ||||
| -                           html_description=True) | ||||
| -    index = build_index('Newsweek', articles, profile['temp dir']) | ||||
| -    profile['url'] = 'file:'+ ('' if iswindows else '//') + index | ||||
| -    profile['timefmt'] = ' [%d %b %Y]' | ||||
| -    profile['max_recursions'] =  2 | ||||
| -    profile['title']          = 'Newsweek' | ||||
| -    profile['url'] = 'file:'+ ('' if iswindows else '//') +index | ||||
| - | ||||
| -def finalize(profile): | ||||
| -    if os.path.isdir(profile['temp dir']): | ||||
| -        shutil.rmtree(profile['temp dir']) | ||||
|          | ||||
|          | ||||
src/libprs500/ebooks/lrf/web/profiles/nytimes.py (new file, 66 lines)
| @@ -0,0 +1,66 @@ | ||||
| ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||
| ##    This program is free software; you can redistribute it and/or modify | ||||
| ##    it under the terms of the GNU General Public License as published by | ||||
| ##    the Free Software Foundation; either version 2 of the License, or | ||||
| ##    (at your option) any later version. | ||||
| ## | ||||
| ##    This program is distributed in the hope that it will be useful, | ||||
| ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| ##    GNU General Public License for more details. | ||||
| ## | ||||
| ##    You should have received a copy of the GNU General Public License along | ||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
| ''' | ||||
| Profile to download the New York Times | ||||
| ''' | ||||
| import re | ||||
| 
 | ||||
| from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||
| from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||
| 
 | ||||
| class NYTimes(DefaultProfile): | ||||
|      | ||||
|     title   = 'The New York Times' | ||||
|     timefmt = ' [%a, %d %b, %Y]' | ||||
|     max_recursions = 2 | ||||
|      | ||||
|     preprocess_regexps = \ | ||||
|             [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||
|               [ | ||||
|                # Remove header bar | ||||
|                (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'), | ||||
|                (r'<div class="articleTools">.*></ul>', lambda match : ''), | ||||
|                # Remove footer bar | ||||
|                (r'<\!--  end \#article -->.*', lambda match : '</body></html>'), | ||||
|                (r'<div id="footer">.*', lambda match : '</body></html>'), | ||||
|                ] | ||||
|               ] | ||||
|                | ||||
|     def get_browser(self): | ||||
|         # Override DefaultProfile.get_browser so articles are fetched while logged in | ||||
|         br = DefaultProfile.get_browser() | ||||
|         if self.username is not None and self.password is not None: | ||||
|             br.open('http://www.nytimes.com/auth/login') | ||||
|             br.select_form(name='login') | ||||
|             br['USERID']   = self.username | ||||
|             br['PASSWORD'] = self.password | ||||
|             br.submit() | ||||
|         return br | ||||
|      | ||||
|     def get_feeds(self): | ||||
|         src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read() | ||||
|         soup = BeautifulSoup(src[src.index('<html'):]) | ||||
|         feeds = [] | ||||
|         for link in soup.findAll('link', attrs={'type':'application/rss+xml'}): | ||||
|             if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',  | ||||
|                                      'Dining & Wine', 'Home & Garden', 'Multimedia', | ||||
|                                      'Most E-mailed Articles',  | ||||
|                                      'Automobiles', 'Fashion & Style', 'Television News', | ||||
|                                      'Education']: | ||||
|                 feeds.append((link['title'], link['href']))             | ||||
|          | ||||
|         return feeds | ||||
|      | ||||
|     def print_version(self, url): | ||||
|         return url + '?&pagewanted=print' | ||||