From 684d03da1f22224ca4b5a00eb3929b1f7c856bbe Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 18 Aug 2007 17:06:00 +0000 Subject: [PATCH] New BBC profile. --- src/libprs500/ebooks/lrf/web/__init__.py | 85 ++++++++++++++++++ src/libprs500/ebooks/lrf/web/bbc.py | 53 +++++++++++ src/libprs500/ebooks/lrf/web/convert_from.py | 85 ++++++++++-------- src/libprs500/ebooks/lrf/web/nytimes.py | 94 +++----------------- src/libprs500/ebooks/lrf/web/profiles.py | 28 +++--- 5 files changed, 207 insertions(+), 138 deletions(-) create mode 100644 src/libprs500/ebooks/lrf/web/bbc.py diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py index aaf49de99e..94743b0eca 100644 --- a/src/libprs500/ebooks/lrf/web/__init__.py +++ b/src/libprs500/ebooks/lrf/web/__init__.py @@ -12,3 +12,88 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +import os, time, calendar, operator + +from libprs500 import iswindows +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup + +def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10): + articles = {} + for title, url in feeds: + src = browser.open(url).read() + articles[title] = [] + soup = BeautifulStoneSoup(src) + for item in soup.findAll('item'): + try: + pubdate = item.find('pubdate').string + if not pubdate: + continue + pubdate = pubdate.replace('+0000', 'GMT') + d = { + 'title' : item.find('title').string, + 'url' : print_version(item.find('guid').string), + 'timestamp': calendar.timegm(time.strptime(pubdate, + '%a, %d %b %Y %H:%M:%S %Z')), + 'date' : pubdate + } + except: + continue + try: + d['description'] = item.find('description').string + except: + d['description'] = '' + articles[title].append(d) + articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True) + articles[title][max_articles_per_feed:] = [] + for item in articles[title]: + item.pop('timestamp') + return articles + + +def build_index(title, articles, dir): + '''Build an RSS based index.html''' + + def build_sub_index(title, items): + ilist = '' + li = u'
  • %(title)s [%(date)s]
    \n'+\ + u'
    %(description)s
  • \n' + for item in items: + ilist += li%item + return u'''\ + + +

    %(title)s

    + + + + '''%dict(title=title, items=ilist.rstrip()) + + cnum = 0 + clist = '' + categories = articles.keys() + categories.sort() + for category in categories: + cnum += 1 + cfile = os.path.join(dir, 'category'+str(cnum)+'.html') + prefix = 'file:' if iswindows else '' + clist += u'
  • %s
  • \n'%(prefix+cfile, category) + src = build_sub_index(category, articles[category]) + open(cfile, 'wb').write(src.encode('utf-8')) + + src = '''\ + + +

    The New York Times

    +
    %(date)s
    + + + + '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist) + index = os.path.join(dir, 'index.html') + open(index, 'wb').write(src.encode('utf-8')) + return index diff --git a/src/libprs500/ebooks/lrf/web/bbc.py b/src/libprs500/ebooks/lrf/web/bbc.py new file mode 100644 index 0000000000..01c63eb4d9 --- /dev/null +++ b/src/libprs500/ebooks/lrf/web/bbc.py @@ -0,0 +1,53 @@ +## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + +import tempfile, shutil, os +from libprs500.ebooks.lrf.web import build_index, parse_feeds + +RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm' + +from libprs500 import __appname__, iswindows, browser +from libprs500.ebooks.BeautifulSoup import BeautifulSoup + + +def get_feeds(browser): + src = browser.open(RSS).read() + soup = BeautifulSoup(src[src.index('%(title)s [%(date)s]
    \n'+\ - u'
    %(description)s
    \n' - for item in items: - ilist += li%item - return u'''\ - - -

    %(title)s

    - - - - '''%dict(title=title, items=ilist.rstrip()) - - cnum = 0 - clist = '' - categories = articles.keys() - categories.sort() - for category in categories: - cnum += 1 - cfile = os.path.join(dir, 'category'+str(cnum)+'.html') - prefix = 'file:' if iswindows else '' - clist += u'
  • %s
  • \n'%(prefix+cfile, category) - src = build_sub_index(category, articles[category]) - open(cfile, 'wb').write(src.encode('utf-8')) - - src = '''\ - - -

    The New York Times

    -
    %(date)s
    - - - - '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist) - index = os.path.join(dir, 'index.html') - open(index, 'wb').write(src.encode('utf-8')) - return index - - def initialize(profile): profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') profile['browser'] = login(profile) feeds = get_feeds(profile['browser']) - articles = parse_feeds(feeds, profile['browser']) - index = build_index(articles, profile['temp dir']) + articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print') + index = build_index('The New York Times', articles, profile['temp dir']) profile['url'] = 'file:'+ ('' if iswindows else '//') + index + profile['timefmt'] = ' [%a, %d %b, %Y]' + profile['max_recursions'] = 2 + profile['title'] = 'The New York Times' def finalize(profile): - shutil.rmtree(profile['temp dir']) + if os.path.isdir(profile['temp dir']): + shutil.rmtree(profile['temp dir']) def login(profile): diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py index b076858059..951b9173fa 100644 --- a/src/libprs500/ebooks/lrf/web/profiles.py +++ b/src/libprs500/ebooks/lrf/web/profiles.py @@ -19,6 +19,8 @@ from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize +from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize +from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize profiles = { @@ -42,9 +44,7 @@ profiles = { 'nytimes' : { 'initialize' : nytimes_initialize, 'finalize' : nytimes_finalize, - 'timefmt' : ' [%a, %d %b, %Y]', - 'max_recursions' : 2, - 'title' : 'The New York Times', + 'preprocess_regexps' : [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ @@ -59,26 +59,24 @@ profiles = { }, 'bbc' : { - 'title' : 'The BBC', - 'no_stylesheets' : True, - 'preprocess_regexps' : + 'initialize' : bbc_initialize, + 'finalize' : bbc_finalize, + 'preprocess_regexps' : [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ - # Remove help link and replace by title - (r'', - lambda match: '

    The BBC

    \n

    %s

    '%(time.strftime('%a %d %b %Y', time.localtime()),)), - # Blank line before categories - (r'\s*BBC', lambda match: '

    BBC'), # Remove footer from individual stories (r'
    .*?Published', lambda match : '