	Add support for user profiles in web2lrf
This commit is contained in:
parent 275b59a2e7
commit 8799a6f3f2
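The headline feature is a new --user-profile switch for web2lrf: point it at a python file defining a DefaultProfile subclass and that profile is loaded and used in place of a built-in one. As a minimal sketch of what such a file could look like (the class name, feed URL, and print-URL rule below are hypothetical; the get_feeds()/print_version() contract comes from the new profiles/__init__.py later in this diff):

# myprofile.py -- hypothetical user profile; invoke with:
#   web2lrf --user-profile /path/to/myprofile.py
from libprs500.ebooks.lrf.web.profiles import DefaultProfile

class ExampleNews(DefaultProfile):  # hypothetical site

    title          = 'Example News'
    max_recursions = 2

    def get_feeds(self):
        # get_feeds() must return a list of (title, url) 2-tuples
        return [('Front Page', 'http://example.com/rss/front.xml')]

    @classmethod
    def print_version(cls, url):
        # Rewrite an article URL into its printer-friendly version (made-up rule)
        return url + '?pagewanted=print'

web2lrf uses the first DefaultProfile subclass it finds in the file, so one class per file is the simplest layout.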
@@ -13,134 +13,3 @@
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-import os, time, calendar, operator, re
-
-from libprs500 import iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
-
-DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
-MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
-FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
-                      July=7, August=8, September=9, October=10,
-                      November=11, December=12)
-
-def strptime(src):
-    src = src.strip().split()
-    src[0] = str(DAY_MAP[src[0][:-1]])+','
-    try:
-        src[2] = str(MONTH_MAP[src[2]])
-    except KeyError:
-        src[2] = str(FULL_MONTH_MAP[src[2]])
-    return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
-
-def process_html_description(tag):
-        src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-def parse_feeds(feeds, browser, print_version,
-                max_articles_per_feed=10,
-                html_description=False,
-                oldest_article=7):
-    '''
-    @param print_version: Callable that takes a url string and returns the url to
-                          printable version of the article pointed to by the original url.
-    @param max_articles_per_feed: Maximum number of articles to download from each feed
-    @param html_description: If true, the articles' descriptions are processed as HTML
-    @param oldest_article: A number in days. No articles older than now - oldest_article
-                           will be downloaded.
-    '''
-    articles = {}
-    for title, url in feeds:
-        try:
-            src = browser.open(url).read()
-        except Exception, err:
-            print 'Could not fetch feed: %s\nError: %s'%(url, err)
-            continue
-
-        articles[title] = []
-        soup = BeautifulStoneSoup(src)
-        for item in soup.findAll('item'):
-            try:
-                pubdate = item.find('pubdate').string
-                if not pubdate:
-                    continue
-                pubdate = pubdate.replace('+0000', 'GMT')
-                d = {
-                    'title'    : item.find('title').string,
-                    'url'      : print_version(item.find('guid').string),
-                    'timestamp': calendar.timegm(strptime(pubdate)),
-                    'date'     : pubdate
-                    }
-                delta = time.time() - d['timestamp']
-                if delta > oldest_article*3600*24:
-                    continue
-
-            except Exception, err:
-                continue
-            try:
-                desc = item.find('description')
-                d['description'] = process_html_description(desc) if html_description else desc.string
-            except:
-                d['description'] = ''
-            articles[title].append(d)
-        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
-        articles[title][max_articles_per_feed:] = []
-        for item in articles[title]:
-            item.pop('timestamp')
-        if not articles[title]:
-            articles.pop(title)
-    return articles
-
-
-def build_index(title, articles, dir):
-    '''Build an RSS based index.html'''
-
-    def build_sub_index(title, items):
-        ilist = ''
-        li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
-            u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
-        for item in items:
-            ilist += li%item
-        return u'''\
-        <html>
-        <body>
-        <h2>%(title)s</h2>
-        <ul>
-        %(items)s
-        </ul>
-        </body>
-        </html>
-        '''%dict(title=title, items=ilist.rstrip())
-
-    cnum = 0
-    clist = ''
-    categories = articles.keys()
-    categories.sort()
-    for category in categories:
-        cnum  += 1
-        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
-        prefix = 'file:' if iswindows else ''
-        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-        src = build_sub_index(category, articles[category])
-        open(cfile, 'wb').write(src.encode('utf-8'))
-
-    src = '''\
-    <html>
-    <body>
-    <h1>%(title)s</h1>
-    <div style='text-align: right; font-weight: bold'>%(date)s</div>
-    <ul>
-    %(categories)s
-    </ul>
-    </body>
-    </html>
-    '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
-             categories=clist, title=title)
-    index = os.path.join(dir, 'index.html')
-    open(index, 'wb').write(src.encode('utf-8'))
-    return index
@@ -1,53 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-
-import tempfile, shutil, os
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-
-RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'
-
-from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
-
-
-def get_feeds(browser):
-    src = browser.open(RSS).read()
-    soup = BeautifulSoup(src[src.index('<html'):])
-    feeds = []
-    ul =  soup.find('ul', attrs={'class':'rss'})
-    for link in ul.findAll('a'):
-        feeds.append((link.string, link['href']))
-    return feeds
-
-def initialize(profile):
-    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
-    profile['browser'] = browser()
-    feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'], lambda x: x.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/'))
-    index = build_index('The BBC', articles, profile['temp dir'])
-    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
-    profile['timefmt'] = ' [%a, %d %b, %Y]'
-    profile['max_recursions'] =  2
-    profile['title']          = 'The BBC'
-    profile['no_stylesheets'] = True
-
-def finalize(profile):
-    if os.path.isdir(profile['temp dir']):
-        shutil.rmtree(profile['temp dir'])
-
-
-
-
@@ -14,43 +14,48 @@
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Convert known websites into LRF files.'''
 
-import sys, time, tempfile, shutil, os, logging
+import sys, time, tempfile, shutil, os, logging, imp, inspect
 from urlparse import urlsplit
 
 from libprs500 import __appname__, setup_cli_handlers, CommandLineError
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks.lrf.html.convert_from import process_file
-from libprs500.ebooks.lrf.web.profiles import profiles
 from libprs500.web.fetch.simple import create_fetcher
 
-available_profiles = profiles.keys()
-available_profiles.remove('default')
-available_profiles = ' '.join(available_profiles)
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes
+from libprs500.ebooks.lrf.web.profiles.bbc import BBC
+from libprs500.ebooks.lrf.web.profiles.newsweek import Newsweek
+
+builtin_profiles   = [NYTimes, BBC, Newsweek]
+available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
 
 def option_parser():
     parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
                           '''%prog downloads a site from the web and converts it '''
                           '''into a LRF file for use with the SONY Reader. '''
-                          '''website_profile is one of '''+available_profiles+\
+                          '''website_profile is one of '''+str(available_profiles)+\
                           ''' If you specify a website_profile of default or do not specify '''
                           '''it, you must specify the --url option.'''
                           )
 
     parser.add_option('-u', '--url', dest='url', default=None,
                       help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
-
+    parser.add_option('--user-profile', default=None,
+                      help='Path to a python file containing a user created profile.')
     parser.add_option('--username', dest='username', default=None,
                       help='Specify the username to be used while downloading. Only used if the profile supports it.')
     parser.add_option('--password', dest='password', default=None,
                       help='Specify the password to be used while downloading. Only used if the profile supports it.')
-    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %(timeout)s s'%profiles['default'],
+    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
                       default=None, type='int', dest='timeout')
-    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %(max_recursions)s'%profiles['default'],
+    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
                       default=None, type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
-                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %(max_files)s'%profiles['default'])
+                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
     parser.add_option('--delay', default=None, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %(delay)s s'%profiles['default'])
+                      help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
     parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                       help='Do not download CSS stylesheets.', dest='no_stylesheets')
 
@@ -85,45 +90,58 @@ def process_profile(args, options, logger=None):
             level = logging.DEBUG if options.verbose else logging.INFO
             logger = logging.getLogger('web2lrf')
             setup_cli_handlers(logger, level)
+        index = -1
+        if options.user_profile is not None:
+            path = os.path.abspath(options.user_profile)
+            name = os.path.splitext(os.path.basename(path))[0]
+            res = imp.find_module(name, [os.path.dirname(path)])
+            module = imp.load_module(name, *res)
+            classes = inspect.getmembers(module,
+                lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
+                           and x is not DefaultProfile)
+            if not classes:
+                raise CommandLineError('Invalid user profile '+path)
+            builtin_profiles.append(classes[0][1])
+            available_profiles.append(name)
+            if len(args) < 2:
+                args.append('')
+            args[1] = name
         if len(args) == 2:
-            if not profiles.has_key(args[1]):
-                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
-        profile = profiles[args[1]] if len(args) == 2 else profiles['default']
-        profile['username'] = options.username
-        profile['password'] = options.password
-        if profile.has_key('initialize'):
-            profile['initialize'](profile)
-        if profile.has_key('browser'):
-            options.browser = profile['browser']
+            try:
+                index = available_profiles.index(args[1])
+            except ValueError:
+                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
+        profile = DefaultProfile if index == -1 else builtin_profiles[index]
+        profile = profile(options.username, options.password)
+        if profile.browser is not None:
+            options.browser = profile.browser
 
         for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
            val = getattr(options, opt)
            if val is None:
-                setattr(options, opt, profile[opt])
+                setattr(options, opt, getattr(profile, opt))
 
         if not options.url:
-            options.url = profile['url']
+            options.url = profile.url
 
         if not options.url:
             raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
 
         if not options.title:
-            title = profile['title']
+            title = profile.title
             if not title:
                 title = urlsplit(options.url).netloc
-            options.title = title + time.strftime(profile['timefmt'], time.localtime())
+            options.title = title + time.strftime(profile.timefmt, time.localtime())
 
-        options.match_regexps += profile['match_regexps']
-        options.preprocess_regexps = profile['preprocess_regexps']
-        options.filter_regexps += profile['filter_regexps']
+        options.match_regexps += profile.match_regexps
+        options.preprocess_regexps = profile.preprocess_regexps
+        options.filter_regexps += profile.filter_regexps
         if len(args) == 2 and args[1] != 'default':
             options.anchor_ids = False
 
         htmlfile, tdir = fetch_website(options, logger)
         create_lrf(htmlfile, options, logger)
     finally:
-        if profile.has_key('finalize'):
-            profile['finalize'](profile)
         if tdir and os.path.isdir(tdir):
             shutil.rmtree(tdir)
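The --user-profile machinery above imports an arbitrary python file by path and scans it for DefaultProfile subclasses. A standalone sketch of that imp/inspect pattern (the function name and error message are illustrative, not part of the commit):

import imp, inspect, os

def load_first_subclass(path, base):
    '''Load the first subclass of `base` defined in the python file at `path`.'''
    name = os.path.splitext(os.path.basename(path))[0]
    # imp.find_module returns a (file, pathname, description) tuple
    found = imp.find_module(name, [os.path.dirname(path)])
    # imp.load_module executes the file under the given module name
    module = imp.load_module(name, *found)
    # Keep only classes deriving from `base`, excluding `base` itself
    classes = inspect.getmembers(module,
        lambda x: inspect.isclass(x) and issubclass(x, base) and x is not base)
    if not classes:
        raise ValueError('No subclass of %s found in %s' % (base.__name__, path))
    return classes[0][1]  # getmembers yields (name, value) pairs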
@@ -1,81 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-
-import tempfile, shutil, os
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-
-RSS = 'http://economist.com/rss/'
-TITLES = [
-          'The world this week',
-          'Letters',
-          'Briefings',
-          'Special reports',
-          'Britain',
-          'Europe',
-          'United States',
-          'The Americas',
-          'Middle East and Africa',
-          'Asia',
-          'International',
-          'Business',
-          'Finance and economics',
-          'Science and technology',
-          'Books and arts',
-          'Indicators'
-          ]
-
-from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
-
-def print_version(url):
-    return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
-
-def get_feeds(browser):
-    src = browser.open(RSS).read()
-    soup = BeautifulSoup(src)
-    feeds = []
-    for ul in soup.findAll('ul'):
-        lis =  ul.findAll('li')
-        try:
-            title, link = lis[0], lis[1]
-        except IndexError:
-            continue
-        title = title.string
-        if title:
-            title = title.strip()
-        if title not in TITLES:
-            continue
-        a = link.find('a')
-        feeds.append((title, a['href'].strip()))
-
-    return feeds
-
-def initialize(profile):
-    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
-    profile['browser'] = browser()
-    feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'], print_version, max_articles_per_feed=20)
-    index = build_index('The Economist', articles, profile['temp dir'])
-    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
-    profile['timefmt'] = ' [%d %b %Y]'
-    profile['max_recursions'] =  3
-    profile['title']          = 'The Economist'
-    profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
-
-def finalize(profile):
-    if os.path.isdir(profile['temp dir']):
-        shutil.rmtree(profile['temp dir'])
-
@@ -1,73 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-'''New York Times from RSS feeds.'''
-import os, tempfile, shutil
-
-from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-
-RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
-LOGIN = 'http://www.nytimes.com/auth/login'
-
-def get_feeds(browser):
-    src = browser.open(RSS).read()
-    soup = BeautifulSoup(src[src.index('<html'):])
-    feeds = []
-    for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
-        if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
-                                 'Dining & Wine', 'Home & Garden', 'Multimedia',
-                                 'Most E-mailed Articles',
-                                 'Automobiles', 'Fashion & Style', 'Television News',
-                                 'Education']:
-            feeds.append((link['title'], link['href']))
-        #else: print link['title']
-
-    return feeds
-
-def initialize(profile):
-    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
-    profile['browser'] = login(profile)
-    feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print',
-                           oldest_article=2)
-    index = build_index('The New York Times', articles, profile['temp dir'])
-    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
-    profile['timefmt'] = ' [%a, %d %b, %Y]'
-    profile['max_recursions'] =  2
-    profile['title']          = 'The New York Times'
-
-
-def finalize(profile):
-    if os.path.isdir(profile['temp dir']):
-        shutil.rmtree(profile['temp dir'])
-
-
-def login(profile):
-    br = browser()
-    if profile['username'] and profile['password']:
-        br.open(LOGIN)
-        br.select_form(name='login')
-        br['USERID']   = profile['username']
-        br['PASSWORD'] = profile['password']
-        br.submit()
-    return br
-
-
-if __name__ == '__main__':
-    feeds = get_feeds()
-    articles = parse_feeds(feeds)
-    print articles
-
@@ -1,136 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-'''Profiles for known websites.'''
-import re
-
-from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
-from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
-from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
-from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
-from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize
-from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize
-from libprs500.ebooks.lrf.web.economist import initialize as economist_initialize
-from libprs500.ebooks.lrf.web.economist import finalize as economist_finalize
-
-
-profiles = {
-            'default' : {
-                         'url'               : '',    # The URL of the website
-                         'title'             : '',    # The title to use for the LRF file
-                         'max_recursions'    : 1,     # Number of levels of links to follow
-                         'max_files'         : 1000,  # Maximum number of files to download
-                         'delay'             : 0,     # Delay between consecutive downloads
-                         'timeout'           : 10,    # Timeout for fetching files from server in seconds
-                         'timefmt'           : ' [%a %d %b %Y]',
-                         'no_stylesheets'    : False, # Download stylesheets
-                         'match_regexps'     : [],    # List of regular expressions that determines which links to follow
-                         'filter_regexps'    : [],    # List of regular expressions that determines which links to ignore
-                         # Only one of match_regexps or filter_regexps should be defined
-                         'html2lrf_options'  : [],    # List of options to pass to html2lrf
-                         'preprocess_regexps': [],    # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
-                         # See the profiles below for examples of these settings.
-                       },
-
-            'nytimes' : {
-                         'initialize'          : nytimes_initialize,
-                         'finalize'            : nytimes_finalize,
-
-                         'preprocess_regexps' :
-                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                          [
-                           # Remove header bar
-                           (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
-                           (r'<div class="articleTools">.*></ul>', lambda match : ''),
-                           # Remove footer bar
-                           (r'<\!--  end \#article -->.*', lambda match : '</body></html>'),
-                           (r'<div id="footer">.*', lambda match : '</body></html>'),
-                           ]
-                          ],
-                         },
-
-            'bbc'     : {
-                          'initialize'          : bbc_initialize,
-                          'finalize'            : bbc_finalize,
-                          'preprocess_regexps' :
-                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                          [
-                           # Remove footer from individual stories
-                           (r'<div class=.footer.>.*?Published',
-                            lambda match : '<p></p><div class="footer">Published'),
-                           # Add some style info in place of disabled stylesheet
-                           (r'<link.*?type=.text/css.*?>', lambda match :
-                            '''<style type="text/css">
-                                .headline {font-size: x-large;}
-                                .fact { padding-top: 10pt  }
-                                </style>'''),
-                           ]
-                          ],
-                          },
-
-            'newsweek' : {
-                          'initialize'          : newsweek_initialize,
-                          'finalize'            : newsweek_finalize,
-                          'preprocess_regexps'  :
-                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                          [
-                           # Make fonts larger
-                           (r'<style.*?\.copyright.*?</style>',
-                            lambda match : \
-                        '''<style type="text/css">'''
-                        '''updateTime{font:small Arial;color:#000000;}'''
-                        '''.credit{font:small Arial;color:#999999;}'''
-                        '''.head{font:bold 18pt x-large;color:#CC0000;}'''
-                        '''.abstract{font:14pt large Verdana;color:#000000;}'''
-                        '''.title{font:bold;color:#000000;}'''
-                        '''.source{font:bold small Verdana;color:#CC0000;}'''
-                        '''.footerLink{font:bold Verdana;color:#000000;}'''
-                        '''.caption{font: Verdana;color:#000000;}'''
-                        '''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
-                        '''.copyright{font-style:italic;}'''
-                        '''</style>'''
-                            ),
-                           ]
-                          ],
-                          },
-
-            'economist' : {
-                           'initialize'          : economist_initialize,
-                           'finalize'            : economist_finalize,
-                           'preprocess_regexps' :
-                           [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                            [
-                             # Remove advert
-                             (r'<noscript.*?</noscript>', lambda match: ''),
-                             ]
-                            ],
-                           },
-            }
-
-for key in profiles.keys():
-    if key == 'default':
-        continue
-    newd = profiles['default'].copy()
-    newd.update(profiles[key])
-    profiles[key] = newd
-
-def profile_to_command_line_options(profile):
-    args = []
-    args.append('--max-recursions='+str(profile['max_recursions']))
-    args.append('--delay='+str(profile['delay']))
-    for i in profile['match_regexps']:
-        args.append('--match-regexp="'+i+'"')
-    for i in profile['filter_regexps']:
-        args.append('--filter-regexp="'+i+'"')
-    return args
src/libprs500/ebooks/lrf/web/profiles/__init__.py (new file, 227 lines)
@@ -0,0 +1,227 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+'''
+
+import tempfile, time, calendar, re, operator
+from htmlentitydefs import name2codepoint
+
+from libprs500 import __appname__, iswindows, browser
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+
+
+class DefaultProfile(object):
+
+    url                   = ''    # The URL of the website
+    title                 = 'Default Profile'    # The title to use for the LRF file
+    max_articles_per_feed = 10    # Maximum number of articles to download from each feed
+    html_description      = False # If True process the <description> element of the feed as HTML
+    oldest_article        = 7     # How many days old should the oldest article downloaded from the feeds be?
+    max_recursions        = 1     # Number of levels of links to follow
+    max_files             = 3000  # Maximum number of files to download
+    delay                 = 0     # Delay between consecutive downloads
+    timeout               = 10    # Timeout for fetching files from server in seconds
+    timefmt               = ' [%a %d %b %Y]' # The format of the date shown on the first page
+    no_stylesheets        = False # Download stylesheets only if False
+    match_regexps         = []    # List of regular expressions that determines which links to follow
+    filter_regexps        = []    # List of regular expressions that determines which links to ignore
+    # Only one of match_regexps or filter_regexps should be defined
+
+    html2lrf_options   = []    # List of options to pass to html2lrf
+    # List of regexp substitution rules to run on the downloaded HTML. Each element of the
+    # list should be a two element tuple. The first element of the tuple should
+    # be a compiled regular expression and the second a callable that takes
+    # a single match object and returns a string to replace the match.
+    preprocess_regexps = []
+
+    # See the built-in profiles for examples of these settings.
+
+    def get_feeds(self):
+        '''
+        Return a list of RSS feeds to fetch for this profile. Each element of the list
+        must be a 2-element tuple of the form (title, url).
+        '''
+        raise NotImplementedError
+
+    @classmethod
+    def print_version(cls, url):
+        '''
+        Take a URL pointing to an article and return the URL pointing to the
+        print version of the article.
+        '''
+        return url
+
+    @classmethod
+    def get_browser(cls):
+        '''
+        Return a browser instance used to fetch documents from the web.
+
+        If your profile requires that you login first, override this method
+        in your subclass. See for example the nytimes profile.
+        '''
+        return browser()
+
+    ########################################################################
+    ###################### End of customizable portion ####################
+    ########################################################################
+
+
+    def __init__(self, username=None, password=None):
+        self.username = username
+        self.password = password
+        self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
+        self.browser = self.get_browser()
+        self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
+
+    def __del__(self):
+        import os, shutil
+        if os.path.isdir(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def build_index(self):
+        '''Build an RSS based index.html'''
+        import os
+        articles = self.parse_feeds()
+
+        def build_sub_index(title, items):
+            ilist = ''
+            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
+                u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
+            for item in items:
+                ilist += li%item
+            return u'''\
+            <html>
+            <body>
+            <h2>%(title)s</h2>
+            <ul>
+            %(items)s
+            </ul>
+            </body>
+            </html>
+            '''%dict(title=title, items=ilist.rstrip())
+
+        cnum = 0
+        clist = ''
+        categories = articles.keys()
+        categories.sort()
+        for category in categories:
+            cnum  += 1
+            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
+            prefix = 'file:' if iswindows else ''
+            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
+            src = build_sub_index(category, articles[category])
+            open(cfile, 'wb').write(src.encode('utf-8'))
+
+        src = '''\
+        <html>
+        <body>
+        <h1>%(title)s</h1>
+        <div style='text-align: right; font-weight: bold'>%(date)s</div>
+        <ul>
+        %(categories)s
+        </ul>
+        </body>
+        </html>
+        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
+                 categories=clist, title=self.title)
+        index = os.path.join(self.temp_dir, 'index.html')
+        open(index, 'wb').write(src.encode('utf-8'))
+        return index
+
+    def parse_feeds(self):
+        feeds = self.get_feeds()
+        articles = {}
+        for title, url in feeds:
+            try:
+                src = self.browser.open(url).read()
+            except Exception, err:
+                print 'Could not fetch feed: %s\nError: %s'%(url, err)
+                continue
+
+            articles[title] = []
+            soup = BeautifulStoneSoup(src)
+            for item in soup.findAll('item'):
+                try:
+                    pubdate = item.find('pubdate').string
+                    if not pubdate:
+                        continue
+                    pubdate = pubdate.replace('+0000', 'GMT')
+                    d = {
+                        'title'    : item.find('title').string,
+                        'url'      : self.print_version(item.find('guid').string),
+                        'timestamp': calendar.timegm(self.strptime(pubdate)),
+                        'date'     : pubdate
+                        }
+                    delta = time.time() - d['timestamp']
+                    if delta > self.oldest_article*3600*24:
+                        continue
+
+                except Exception, err:
+                    continue
+                try:
+                    desc = item.find('description')
+                    d['description'] = self.process_html_description(desc) if self.html_description else desc.string
+                except:
+                    d['description'] = ''
+                articles[title].append(d)
+            articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
+            articles[title][self.max_articles_per_feed:] = []
+            for item in articles[title]:
+                item.pop('timestamp')
+            if not articles[title]:
+                articles.pop(title)
+        return articles
+
+    @classmethod
+    def process_html_description(cls, tag):
+        src = '\n'.join(tag.contents)
+        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+        for e in replaced_entities:
+            ent = '&'+e+';'
+            src = src.replace(ent, unichr(name2codepoint[e]))
+        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+    DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
+    MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
+    FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
+                      July=7, August=8, September=9, October=10,
+                      November=11, December=12)
+
+    @classmethod
+    def strptime(cls, src):
+        src = src.strip().split()
+        src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
+        try:
+            src[2] = str(cls.MONTH_MAP[src[2]])
+        except KeyError:
+            src[2] = str(cls.FULL_MONTH_MAP[src[2]])
+        return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
+
+    def command_line_options(self):
+        args = []
+        args.append('--max-recursions='+str(self.max_recursions))
+        args.append('--delay='+str(self.delay))
+        args.append('--max-files='+str(self.max_files))
+        for i in self.match_regexps:
+            args.append('--match-regexp="'+i+'"')
+        for i in self.filter_regexps:
+            args.append('--filter-regexp="'+i+'"')
+        return args
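DefaultProfile.strptime exists because RSS pubDate values spell out English day and month names, which a fixed numeric time.strptime format cannot digest directly; the DAY_MAP/MONTH_MAP tables rewrite them to numbers first. A standalone sketch of the same transformation (the sample date string is invented):

import time, calendar

DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                 Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)

def parse_pubdate(src):
    # 'Sun, 29 Jul 2007 10:30:00 GMT' -> time.struct_time (UTC)
    parts = src.strip().split()
    parts[0] = str(DAY_MAP[parts[0][:-1]]) + ','  # 'Sun,' -> '0,'
    parts[2] = str(MONTH_MAP[parts[2]])           # 'Jul'  -> '7'
    return time.strptime(' '.join(parts), '%w, %d %m %Y %H:%M:%S %Z')

# Epoch seconds, as used for the oldest_article cutoff in parse_feeds()
print calendar.timegm(parse_pubdate('Sun, 29 Jul 2007 10:30:00 GMT'))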
src/libprs500/ebooks/lrf/web/profiles/bbc.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Fetch the BBC.
+'''
+import re
+
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class BBC(DefaultProfile):
+
+    title = 'The BBC'
+    max_recursions = 2
+    timefmt  = ' [%a, %d %b, %Y]'
+    no_stylesheets = True
+
+    preprocess_regexps = \
+        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+          [
+           # Remove footer from individual stories
+           (r'<div class=.footer.>.*?Published',
+            lambda match : '<p></p><div class="footer">Published'),
+           # Add some style info in place of disabled stylesheet
+           (r'<link.*?type=.text/css.*?>', lambda match :
+            '''<style type="text/css">
+                .headline {font-size: x-large;}
+                .fact { padding-top: 10pt  }
+                </style>'''),
+           ]
+          ]
+
+    def print_version(self, url):
+        return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
+
+    def get_feeds(self):
+        src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
+        soup = BeautifulSoup(src[src.index('<html'):])
+        feeds = []
+        ul =  soup.find('ul', attrs={'class':'rss'})
+        for link in ul.findAll('a'):
+            feeds.append((link.string, link['href']))
+        return feeds
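BBC.print_version rewrites an article URL by prefixing the BBC print gateway rather than appending a query string. For an invented article URL the rewrite behaves like this:

url = 'http://news.bbc.co.uk/2/hi/technology/1234567.stm'  # hypothetical article URL
print url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
# -> http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk/2/hi/technology/1234567.stm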
src/libprs500/ebooks/lrf/web/profiles/economist.py (new file, 81 lines)
@@ -0,0 +1,81 @@
|  | ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||||
|  | ##    This program is free software; you can redistribute it and/or modify | ||||||
|  | ##    it under the terms of the GNU General Public License as published by | ||||||
|  | ##    the Free Software Foundation; either version 2 of the License, or | ||||||
|  | ##    (at your option) any later version. | ||||||
|  | ## | ||||||
|  | ##    This program is distributed in the hope that it will be useful, | ||||||
|  | ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | ##    GNU General Public License for more details. | ||||||
|  | ## | ||||||
|  | ##    You should have received a copy of the GNU General Public License along | ||||||
|  | ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  | ''' | ||||||
|  | Fetch The Economist. | ||||||
|  | ''' | ||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||||
|  | from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||||
|  | 
 | ||||||
|  | class Economist(DefaultProfile): | ||||||
|  |      | ||||||
|  |     title = 'The Economist' | ||||||
|  |     timefmt = ' [%d %b %Y]' | ||||||
|  |     max_recursions = 3 | ||||||
|  |      | ||||||
|  |     TITLES = [ | ||||||
|  |           'The world this week', | ||||||
|  |           'Letters', | ||||||
|  |           'Briefings', | ||||||
|  |           'Special reports', | ||||||
|  |           'Britain', | ||||||
|  |           'Europe', | ||||||
|  |           'United States', | ||||||
|  |           'The Americas', | ||||||
|  |           'Middle East and Africa', | ||||||
|  |           'Asia', | ||||||
|  |           'International', | ||||||
|  |           'Business', | ||||||
|  |           'Finance and economics', | ||||||
|  |           'Science and technology', | ||||||
|  |           'Books and arts', | ||||||
|  |           'Indicators' | ||||||
|  |           ] | ||||||
|  |      | ||||||
|  |     preprocess_regexps = \ | ||||||
|  |         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||||
|  |             [ | ||||||
|  |              # Remove advert | ||||||
|  |              (r'<noscript.*?</noscript>', lambda match: ''), | ||||||
|  |              ] | ||||||
|  |             ] | ||||||
|  |      | ||||||
|  |     def __init__(self, username=None, password=None): | ||||||
|  |         DefaultProfile.__init__(self, username, password) | ||||||
|  |         self.browser = None # Needed, as otherwise there are timeouts while fetching the actual articles | ||||||
|  |      | ||||||
|  |     def print_version(self, url): | ||||||
|  |         return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '') | ||||||
|  |      | ||||||
|  |     def get_feeds(self): | ||||||
|  |         src = self.browser.open('http://economist.com/rss/').read() | ||||||
|  |         soup = BeautifulSoup(src) | ||||||
|  |         feeds = [] | ||||||
|  |         for ul in soup.findAll('ul'): | ||||||
|  |             lis =  ul.findAll('li') | ||||||
|  |             try: | ||||||
|  |                 title, link = lis[0], lis[1] | ||||||
|  |             except IndexError: | ||||||
|  |                 continue | ||||||
|  |             title = title.string | ||||||
|  |             if title: | ||||||
|  |                 title = title.strip() | ||||||
|  |             if title not in self.__class__.TITLES: | ||||||
|  |                 continue | ||||||
|  |             a = link.find('a') | ||||||
|  |             feeds.append((title, a['href'].strip())) | ||||||
|  |              | ||||||
|  |         return feeds | ||||||
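For illustration, the rewrite performed by Economist.print_version, traced on a made-up article URL (the story id below is a placeholder, not a real article):

    url = 'http://economist.com/displaystory.cfm?story_id=123&fsrc=RSS'
    print url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
    # -> http://economist.com/PrinterFriendly.cfm?story_id=123
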
| @ -12,14 +12,27 @@ | |||||||
| ##    You should have received a copy of the GNU General Public License along | ##    You should have received a copy of the GNU General Public License along | ||||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
| '''Logic to create a Newsweek HTML aggregator from RSS feeds''' | ''' | ||||||
|  | Profile to download Newsweek | ||||||
|  | ''' | ||||||
|  | from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||||
| 
 | 
 | ||||||
| import tempfile, os, shutil | class Newsweek(DefaultProfile): | ||||||
|      |      | ||||||
| from libprs500.ebooks.lrf.web import build_index, parse_feeds |     title = 'Newsweek' | ||||||
| from libprs500 import __appname__, iswindows, browser |     max_recursions = 2 | ||||||
|  |     timefmt  = ' [%d %b %Y]' | ||||||
|  |     html_description = True | ||||||
|  |     oldest_article = 15 | ||||||
|      |      | ||||||
| RSS_FEEDS = [ |          | ||||||
|  |     def print_version(self, url): | ||||||
|  |         if not url.endswith('/'): | ||||||
|  |             url += '/' | ||||||
|  |         return url + 'output/print' | ||||||
|  |      | ||||||
|  |     def get_feeds(self): | ||||||
|  |         return [ | ||||||
|              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',), |              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',), | ||||||
|              ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'), |              ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'), | ||||||
|              ('Politics', 'http://feeds.newsweek.com/headlines/politics'), |              ('Politics', 'http://feeds.newsweek.com/headlines/politics'), | ||||||
| @ -34,26 +47,3 @@ RSS_FEEDS = [ | |||||||
|              ] |              ] | ||||||
|          |          | ||||||
|          |          | ||||||
| def print_version(url): |  | ||||||
|     if '?' in url: |  | ||||||
|         url = url[:url.index('?')] |  | ||||||
|     if not url.endswith('/'): |  | ||||||
|         url += '/' |  | ||||||
|     return url + 'output/print' |  | ||||||
| 
 |  | ||||||
| def initialize(profile): |  | ||||||
|     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') |  | ||||||
|     profile['browser'] = browser() |  | ||||||
|     articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,  |  | ||||||
|                            max_articles_per_feed=20, oldest_article=15,  |  | ||||||
|                            html_description=True) |  | ||||||
|     index = build_index('Newsweek', articles, profile['temp dir']) |  | ||||||
|     profile['url'] = 'file:'+ ('' if iswindows else '//') + index |  | ||||||
|     profile['timefmt'] = ' [%d %b %Y]' |  | ||||||
|     profile['max_recursions'] =  2 |  | ||||||
|     profile['title']          = 'Newsweek' |  | ||||||
|     profile['url'] = 'file:'+ ('' if iswindows else '//') +index |  | ||||||
| 
 |  | ||||||
| def finalize(profile): |  | ||||||
|     if os.path.isdir(profile['temp dir']): |  | ||||||
|         shutil.rmtree(profile['temp dir']) |  | ||||||
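The deleted initialize/finalize pair above wired everything by hand; under the new scheme those settings are class attributes and the driver does the wiring. A rough sketch of the equivalence, assuming the driver calls parse_feeds much as the deleted code did and that DefaultProfile exposes the configured browser as profile.browser (as the get_feeds implementations in this commit do):

    # Sketch only; the actual call site is not part of this diff.
    profile  = Newsweek()
    articles = parse_feeds(profile.get_feeds(), profile.browser,
                           profile.print_version,
                           max_articles_per_feed=20,
                           oldest_article=profile.oldest_article,
                           html_description=profile.html_description)
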
66 src/libprs500/ebooks/lrf/web/profiles/nytimes.py Normal file
							| @ -0,0 +1,66 @@ | |||||||
|  | ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||||
|  | ##    This program is free software; you can redistribute it and/or modify | ||||||
|  | ##    it under the terms of the GNU General Public License as published by | ||||||
|  | ##    the Free Software Foundation; either version 2 of the License, or | ||||||
|  | ##    (at your option) any later version. | ||||||
|  | ## | ||||||
|  | ##    This program is distributed in the hope that it will be useful, | ||||||
|  | ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | ##    GNU General Public License for more details. | ||||||
|  | ## | ||||||
|  | ##    You should have received a copy of the GNU General Public License along | ||||||
|  | ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  | ''' | ||||||
|  | Profile to download the New York Times | ||||||
|  | ''' | ||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||||
|  | from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||||
|  | 
 | ||||||
|  | class NYTimes(DefaultProfile): | ||||||
|  |      | ||||||
|  |     title   = 'The New York Times' | ||||||
|  |     timefmt = ' [%a, %d %b, %Y]' | ||||||
|  |     max_recursions = 2 | ||||||
|  |      | ||||||
|  |     preprocess_regexps = \ | ||||||
|  |             [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||||
|  |               [ | ||||||
|  |                # Remove header bar | ||||||
|  |                (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'), | ||||||
|  |                (r'<div class="articleTools">.*?</ul>', lambda match : ''), | ||||||
|  |                # Remove footer bar | ||||||
|  |                (r'<\!--  end \#article -->.*', lambda match : '</body></html>'), | ||||||
|  |                (r'<div id="footer">.*', lambda match : '</body></html>'), | ||||||
|  |                ] | ||||||
|  |               ] | ||||||
|  |                | ||||||
|  |     def browser(self): | ||||||
|  |         br = DefaultProfile.browser(self) | ||||||
|  |         if self.username is not None and self.password is not None: | ||||||
|  |             br.open('http://www.nytimes.com/auth/login') | ||||||
|  |             br.select_form(name='login') | ||||||
|  |             br['USERID']   = self.username | ||||||
|  |             br['PASSWORD'] = self.password | ||||||
|  |             br.submit() | ||||||
|  |         return br | ||||||
|  |      | ||||||
|  |     def get_feeds(self): | ||||||
|  |         src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read() | ||||||
|  |         soup = BeautifulSoup(src[src.index('<html'):]) | ||||||
|  |         feeds = [] | ||||||
|  |         for link in soup.findAll('link', attrs={'type':'application/rss+xml'}): | ||||||
|  |             if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',  | ||||||
|  |                                      'Dining & Wine', 'Home & Garden', 'Multimedia', | ||||||
|  |                                      'Most E-mailed Articles',  | ||||||
|  |                                      'Automobiles', 'Fashion & Style', 'Television News', | ||||||
|  |                                      'Education']: | ||||||
|  |                 feeds.append((link['title'], link['href']))             | ||||||
|  |          | ||||||
|  |         return feeds | ||||||
|  |      | ||||||
|  |     def print_version(self, url): | ||||||
|  |         return url + '?&pagewanted=print' | ||||||
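Finally, a hedged usage sketch for the NYTimes profile: with credentials supplied, browser() logs in through the site's login form before anything else is fetched. The account values below are placeholders, and the sketch assumes DefaultProfile hooks the browser() result up as self.browser, as get_feeds above expects:

    prof = NYTimes(username='reader@example.com', password='secret')
    for title, url in prof.get_feeds():
        print title, '->', prof.print_version(url)
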