From e07305bcacdc227ec36c16648e997ec438b823ff Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 14 Jul 2007 18:27:38 +0000
Subject: [PATCH] Implemented newsweek profile.

---
 src/libprs500/__init__.py                    |   2 +-
 src/libprs500/ebooks/lrf/web/convert_from.py |  11 +-
 src/libprs500/ebooks/lrf/web/newsweek.py     | 141 +++++++++++++++++++
 src/libprs500/ebooks/lrf/web/profiles.py     |  37 ++++-
 4 files changed, 185 insertions(+), 6 deletions(-)
 create mode 100644 src/libprs500/ebooks/lrf/web/newsweek.py

diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index 1b2963a5fa..66127b3d38 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.70"
+__version__ = "0.3.71"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
diff --git a/src/libprs500/ebooks/lrf/web/convert_from.py b/src/libprs500/ebooks/lrf/web/convert_from.py
index 23a767599e..33696b5ccf 100644
--- a/src/libprs500/ebooks/lrf/web/convert_from.py
+++ b/src/libprs500/ebooks/lrf/web/convert_from.py
@@ -61,8 +61,7 @@ def option_parser():
 
 def fetch_website(options):
     tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
-    options.dir = tdir
-    web2disk_setup_logger(options)
+    options.dir = tdir
     fetcher = create_fetcher(options)
     fetcher.preprocess_regexps = options.preprocess_regexps
     return fetcher.start_fetch(options.url), tdir
@@ -77,6 +76,7 @@ def create_lrf(htmlfile, options):
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
+    web2disk_setup_logger(options)
     if len(args) > 2:
         parser.print_help()
         return 1
@@ -87,6 +87,9 @@ def main(args=sys.argv):
         return 1
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
 
+    if profile.has_key('initialize'):
+        profile['initialize'](profile)
+
     for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
         val = getattr(options, opt)
         if val is None:
@@ -103,7 +106,7 @@ def main(args=sys.argv):
     title = profile['title']
     if not title:
         title = urlsplit(options.url).netloc
-    options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
+    options.title = title + time.strftime(profile['timefmt'], time.localtime())
 
     options.match_regexps += profile['match_regexps']
     options.preprocess_regexps = profile['preprocess_regexps']
@@ -111,6 +114,8 @@ def main(args=sys.argv):
 
     htmlfile, tdir = fetch_website(options)
     create_lrf(htmlfile, options)
+    if profile.has_key('finalize'):
+        profile['finalize'](profile)
     shutil.rmtree(tdir)
     return 0
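Note on the convert_from.py hunks above: profiles gain an optional lifecycle.
main() now calls profile['initialize'](profile) before fetching and
profile['finalize'](profile) after the LRF has been created, and the title
timestamp now comes from the per-profile 'timefmt' key instead of a hard-coded
format. A minimal sketch of that hook contract follows; the 'example' profile
and its URL are invented here purely for illustration:

    # Profiles are plain dicts; both hooks receive the dict itself, so
    # initialize() may rewrite keys such as 'url' before fetching starts.
    def example_initialize(profile):
        # Point the fetcher at locally generated content, as the Newsweek
        # profile below does with its aggregator page.
        profile['url'] = 'file:///tmp/example/index.html'

    def example_finalize(profile):
        # Clean up whatever initialize() created; runs after create_lrf().
        pass

    example_profile = {
        'title'      : 'Example',
        'timefmt'    : ' [%d %b %Y]',
        'initialize' : example_initialize,
        'finalize'   : example_finalize,
        }

Both keys are optional: main() tests for them with has_key() before calling.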
diff --git a/src/libprs500/ebooks/lrf/web/newsweek.py b/src/libprs500/ebooks/lrf/web/newsweek.py
new file mode 100644
index 0000000000..daf3a9641b
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/newsweek.py
@@ -0,0 +1,141 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Logic to create a Newsweek HTML aggregator from RSS feeds'''
+
+import sys, urllib2, time, re, tempfile, os, shutil
+
+from libprs500 import __appname__
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+from htmlentitydefs import name2codepoint
+
+RSS_FEEDS = [
+    ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
+    ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
+    ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
+    ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
+    ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
+    ('Health', 'http://feeds.newsweek.com/sections/health'),
+    ('Society', 'http://feeds.newsweek.com/newsweek/society'),
+    ('Business', 'http://feeds.newsweek.com/newsweek/business'),
+    ('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
+    ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
+    ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
+    ]
+
+BASE_TEMPLATE=\
+u'''
+<html>
+<body>
+<h1>Newsweek</h1>
+<b>%(date)s</b>
+<br/>
+<h2>Table of Contents</h2>
+<ul>
+%(toc)s
+</ul>
+<hr/>
+</body>
+</html>
+'''
+
+SECTION_TEMPLATE=\
+u'''
+<html>
+<body>
+<h1>%(title)s</h1>
+<br/>
+<h2>Table of Contents</h2>
+<ul>
+%(toc)s
+</ul>
+<hr/>
+</body>
+</html>
+'''
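The two templates above are filled with ordinary Python %-dict substitution.
A short usage sketch of how the index page is produced (the toc value here is
illustrative; create_aggregator() below builds the real one):

    import time
    toc = u'<li><a href="sec1.html">Cover Story</a></li>\n'
    src = BASE_TEMPLATE % dict(
        toc=toc,
        date=time.strftime('%d %B, %Y', time.localtime()))
    open('index.html', 'w').write(src.encode('utf8'))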
+
+_tdir = None
+def create_aggregator(sections):
+    '''Return aggregator HTML encoded in utf8'''
+    toc, sec = u'', 0
+    global _tdir
+    _tdir = tempfile.mkdtemp(prefix=__appname__)
+    for section in sections:
+        sec += 1
+        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
+        title, contents = section
+        toc += '<li><a href="%s">%s</a></li>\n'%(secfile, title,)
+        stoc = u''
+        for item in contents:
+            desc = item['description'].strip()
+            stoc += '<li><a href="%(link)s">%(title)s</a>'%dict(link=item['link'], title=item['title'])
+            if desc:
+                stoc += '<div>%s</div>\n'%(desc,)
+            stoc += '</li>\n'
+        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
+        open(secfile, 'w').write(section.encode('utf8'))
+    index = os.path.join(_tdir, 'index.html')
+    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
+    open(index, 'w').write(src.encode('utf8'))
+    return index
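create_aggregator() expects a list of (section_title, items) pairs, where each
item is a dict with 'title', 'description' and 'link' keys, the exact shape
that get_contents() below returns. A hypothetical call with invented values:

    sections = [
        ('Cover Story', [
            {'title'      : 'A sample article',
             'description': 'One-sentence summary taken from the RSS item.',
             'link'       : 'http://www.newsweek.com/id/12345/print/1/displaymode/1098/'},
            ]),
        ]
    index = create_aggregator(sections)  # absolute path to index.html
    # The temporary directory holding index.html and sec*.html is kept in
    # _tdir and removed later by finalize().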
+
+def get_contents():
+    ''' Parse Newsweek RSS feeds to get links to all articles'''
+
+    def nstounicode(ns):
+        return unicode(str(ns), 'utf8')
+
+    def fix_link(link):
+        if '?' in link:
+            link = link[:link.index('?')]
+        return link + 'print/1/displaymode/1098/'
+
+    def process_description(tag):
+        src = '\n'.join(tag.contents)
+        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+        for e in replaced_entities:
+            ent = '&'+e+';'
+            src = src.replace(ent, unichr(name2codepoint[e]))
+        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+    pages = []
+    for title, url in RSS_FEEDS:
+        soup = BeautifulStoneSoup(urllib2.urlopen(url))
+        contents = []
+        for item in soup.findAll('item'):
+            d = {
+                'title' : nstounicode(item.title.contents[0]),
+                'description': process_description(item.description),
+                'link': fix_link(nstounicode(item.guid.contents[0]))
+                }
+            if '<' in d['description']:
+                d['description'] = d['description'][:d['description'].index('<')]
+            contents.append(d)
+        pages.append((title, contents))
+    return pages
+
+
+def initialize(profile):
+    print 'Fetching feeds...',
+    sys.stdout.flush()
+    contents = get_contents()
+    print 'done'
+    index = create_aggregator(contents)
+    profile['url'] = 'file://'+index
+
+def finalize(profile):
+    global _tdir
+    shutil.rmtree(_tdir)
diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py
index 4b363857cc..8ca0a228d6 100644
--- a/src/libprs500/ebooks/lrf/web/profiles.py
+++ b/src/libprs500/ebooks/lrf/web/profiles.py
@@ -13,9 +13,11 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Profiles for known websites.'''
-
 import time, re
 
+from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
+from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
+
 profiles = {
     'default' : {
         'url' : '', # The URL of the website
@@ -24,6 +26,7 @@ profiles = {
         'max_files' : 1000, # Maximum number of files to download
         'delay' : 0, # Delay between consecutive downloads
         'timeout' : 10, # Timeout for fetching files from server
+        'timefmt' : ' [%a %d %b %Y]',
         'no_stylesheets' : False, # Download stylesheets
         'match_regexps' : [], # List of regular expressions that determines which links to follow
         'filter_regexps' : [], # List of regular expressions that determines which links to ignore
@@ -78,7 +81,37 @@ profiles = {
                       ''),
                   ]
               ],
-        },
+        },
+
+    'newsweek' : {
+        'initialize' : newsweek_initialize,
+        'finalize' : newsweek_finalize,
+        'title' : 'Newsweek',
+        'timefmt' : ' [%d %b %Y]',
+        'no_stylesheets' : True,
+        'max_recursions' : 2,
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Make fonts larger
+                (r'<body.*?>',
+                 lambda match : \
+                 '''<body style="font-size: x-large">'''
+                 ),
+              ]
+            ],
+        },
     }
 
 for key in profiles.keys():
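With the profile registered above, the pipeline is driven end to end by
convert_from.main(): the profile name is the positional argument, initialize()
swaps in the local aggregator page as the start URL, the web2disk fetcher
downloads it and the linked print-mode articles, and finalize() removes the
temporary directory. Assuming the web2lrf entry point that wraps
convert_from.main() (the script name comes from the surrounding codebase, not
this patch):

    web2lrf newsweek

or equivalently from Python:

    from libprs500.ebooks.lrf.web.convert_from import main
    main(['web2lrf', 'newsweek'])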