diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py
index 1f3ab776df..97ad144cc4 100644
--- a/src/libprs500/ebooks/lrf/web/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/__init__.py
@@ -13,134 +13,3 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-import os, time, calendar, operator, re
-
-from libprs500 import iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
-
-DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
-MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
-FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
-                      July=7, August=8, September=9, October=10,
-                      November=11, December=12)
-
-def strptime(src):
-    src = src.strip().split()
-    src[0] = str(DAY_MAP[src[0][:-1]])+','
-    try:
-        src[2] = str(MONTH_MAP[src[2]])
-    except KeyError:
-        src[2] = str(FULL_MONTH_MAP[src[2]])
-    return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
-
-def process_html_description(tag):
-    src = '\n'.join(tag.contents)
-    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-    for e in replaced_entities:
-        ent = '&'+e+';'
-        src = src.replace(ent, unichr(name2codepoint[e]))
-    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-def parse_feeds(feeds, browser, print_version,
-                max_articles_per_feed=10,
-                html_description=False,
-                oldest_article=7):
-    '''
-    @param print_version: Callable that takes a url string and returns the url to
-                          printable version of the article pointed to by the original url.
-    @param max_articles_per_feed: Maximum number of articles to download from each feed
-    @param html_description: If true the atricles descriptions are processed as HTML
-    @param oldest_article: A number in days. No articles older than now - oldest_aticle
-                           will be downloaded.
-    '''
-    articles = {}
-    for title, url in feeds:
-        try:
-            src = browser.open(url).read()
-        except Exception, err:
-            print 'Could not fetch feed: %s\nError: %s'%(url, err)
-            continue
-
-        articles[title] = []
-        soup = BeautifulStoneSoup(src)
-        for item in soup.findAll('item'):
-            try:
-                pubdate = item.find('pubdate').string
-                if not pubdate:
-                    continue
-                pubdate = pubdate.replace('+0000', 'GMT')
-                d = {
-                    'title'    : item.find('title').string,
-                    'url'      : print_version(item.find('guid').string),
-                    'timestamp': calendar.timegm(strptime(pubdate)),
-                    'date'     : pubdate
-                    }
-                delta = time.time() - d['timestamp']
-                if delta > oldest_article*3600*24:
-                    continue
-
-            except Exception, err:
-                continue
-            try:
-                desc = item.find('description')
-                d['description'] = process_html_description(desc) if html_description else desc.string
-            except:
-                d['description'] = ''
-            articles[title].append(d)
-        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
-        articles[title][max_articles_per_feed:] = []
-        for item in articles[title]:
-            item.pop('timestamp')
-        if not articles[title]:
-            articles.pop(title)
-    return articles
-
-
-def build_index(title, articles, dir):
-    '''Build an RSS based index.html'''
-
-    def build_sub_index(title, items):
-        ilist = ''
-        li = u'<li><a href="%(url)s">%(title)s</a> [%(date)s]<br/>\n'+\
-             u'<div style="font-size:small; font-family:sans">%(description)s</div></li>\n'
-        for item in items:
-            ilist += li%item
-        return u'''\
-        <html>
-        <body>
-        <h2>%(title)s</h2>
-        <ul>
-        %(items)s
-        </ul>
-        </body>
-        </html>
-        '''%dict(title=title, items=ilist.rstrip())
-
-    cnum = 0
-    clist = ''
-    categories = articles.keys()
-    categories.sort()
-    for category in categories:
-        cnum += 1
-        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
-        prefix = 'file:' if iswindows else ''
-        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-        src = build_sub_index(category, articles[category])
-        open(cfile, 'wb').write(src.encode('utf-8'))
-
-    src = '''\
-    <html>
-    <body>
-    <h1>%(title)s</h1>
-    <div style="text-align:right; font-weight:bold">%(date)s</div>
-    <ul>
-    %(categories)s
-    </ul>
-    </body>
-    </html>
-    '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
-             categories=clist, title=title)
-    index = os.path.join(dir, 'index.html')
-    open(index, 'wb').write(src.encode('utf-8'))
-    return index
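For reference, the two helpers deleted above formed the old RSS pipeline: parse_feeds() fetches each feed with the supplied browser, keeps at most max_articles_per_feed items newer than oldest_article days, and returns a mapping of feed title to article dicts; build_index() then renders that mapping into an index.html plus one category<N>.html per feed. A minimal sketch of how they were wired together before this patch, based only on the signatures and docstrings above (the feed URL, output directory and print_version rule are illustrative, and browser() is assumed to return the mechanize-style object libprs500 provides):

    import tempfile
    from libprs500 import browser
    from libprs500.ebooks.lrf.web import parse_feeds, build_index  # pre-patch layout

    feeds = [('Front Page', 'http://example.com/front_page/rss.xml')]  # (title, feed url) pairs
    b = browser()                   # any object whose open(url).read() returns the feed XML
    printable = lambda url: url     # would normally rewrite an article url to its print-friendly page
    articles = parse_feeds(feeds, b, printable,
                           max_articles_per_feed=10,
                           html_description=True,   # run descriptions through process_html_description
                           oldest_article=7)        # skip items older than a week
    out_dir = tempfile.mkdtemp()
    index = build_index('My News', articles, out_dir)  # path of the generated index.html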
diff --git a/src/libprs500/ebooks/lrf/web/bbc.py b/src/libprs500/ebooks/lrf/web/bbc.py
deleted file mode 100644
index 01c63eb4d9..0000000000
--- a/src/libprs500/ebooks/lrf/web/bbc.py
+++ /dev/null
@@ -1,53 +0,0 @@
-## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-## This program is free software; you can redistribute it and/or modify
-## it under the terms of the GNU General Public License as published by
-## the Free Software Foundation; either version 2 of the License, or
-## (at your option) any later version.
-##
-## This program is distributed in the hope that it will be useful,
-## but WITHOUT ANY WARRANTY; without even the implied warranty of
-## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-## GNU General Public License for more details.
-##
-## You should have received a copy of the GNU General Public License along
-## with this program; if not, write to the Free Software Foundation, Inc.,
-## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-
-import tempfile, shutil, os
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-
-RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'
-
-from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
-
-
-def get_feeds(browser):
-    src = browser.open(RSS).read()
-    soup = BeautifulSoup(src[src.index(').*?.*>', lambda match : ''),
-            # Remove footer bar
-            (r'<\!-- end \#article -->.*', lambda match : ''),
-            (r'