Kovid Goyal 2007-10-07 20:12:25 +00:00
parent 4477a78a5b
commit b1dea424fe
4 changed files with 45 additions and 112 deletions

View File

@@ -13,12 +13,32 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-import os, time, calendar, operator
+import os, time, calendar, operator, re
 from libprs500 import iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+from htmlentitydefs import name2codepoint

-def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
+def process_html_description(tag):
+    src = '\n'.join(tag.contents)
+    replaced_entities = [ 'amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+    for e in replaced_entities:
+        ent = '&'+e+';'
+        src = src.replace(ent, unichr(name2codepoint[e]))
+    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+def parse_feeds(feeds, browser, print_version,
+                max_articles_per_feed=10,
+                html_description=False,
+                oldest_article=7):
+    '''
+    @param print_version: Callable that takes a url string and returns the url to
+                          the printable version of the article pointed to by the original url.
+    @param max_articles_per_feed: Maximum number of articles to download from each feed
+    @param html_description: If True, article descriptions are processed as HTML
+    @param oldest_article: A number in days. No articles older than now - oldest_article
+                           will be downloaded.
+    '''
     articles = {}
     for title, url in feeds:
         src = browser.open(url).read()
@@ -37,10 +57,14 @@ def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
                                                  '%a, %d %b %Y %H:%M:%S %Z')),
                     'date'      : pubdate
                     }
+                delta = time.time() - d['timestamp']
+                if delta > oldest_article*3600*24:
+                    continue
             except:
                 continue
             try:
-                d['description'] = item.find('description').string
+                desc = item.find('description')
+                d['description'] = process_html_description(desc) if html_description else desc.string
             except:
                 d['description'] = ''
             articles[title].append(d)
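
As a quick illustration of the new helper (not part of the commit), the Python 2 sketch below mirrors process_html_description to show what it does to a feed item's description: the listed named entities are expanded to characters and any embedded <a>...</a> markup is dropped. The FakeTag class and the sample strings are invented stand-ins for the BeautifulStoneSoup tag the real code passes in.

# Illustration only (Python 2, matching the codebase); FakeTag stands in
# for the BeautifulStoneSoup <description> tag the real helper receives.
import re
from htmlentitydefs import name2codepoint

class FakeTag(object):
    def __init__(self, contents):
        self.contents = contents

def process_html_description(tag):
    # Same logic as the helper added above: expand a few named entities,
    # then strip embedded links entirely.
    src = '\n'.join(tag.contents)
    for e in ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']:
        src = src.replace('&'+e+';', unichr(name2codepoint[e]))
    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

tag = FakeTag([u'Breaking &ldquo;news&rdquo; &amp; analysis',
               u'<a href="http://example.com/x">Read more</a>'])
print process_html_description(tag).encode('utf-8')
# prints: Breaking "news" & analysis   (curly quotes; the link is gone)

The age filter added in the second hunk works in seconds: time.time() - d['timestamp'] is compared against oldest_article*3600*24, so the default oldest_article=7 corresponds to a cutoff of 604800 seconds.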

View File

@ -76,7 +76,6 @@ def initialize(profile):
profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
def finalize(profile): def finalize(profile):
return
if os.path.isdir(profile['temp dir']): if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir']) shutil.rmtree(profile['temp dir'])
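
The deleted return statement had made the two lines after it unreachable, so the scratch directory referenced here was never cleaned up. A minimal sketch (not part of the commit) of the lifecycle these hooks assume, using only the 'temp dir' key seen above; the prefix string is illustrative:

import os, shutil, tempfile

def initialize(profile):
    # Each run works in its own scratch directory.
    profile['temp dir'] = tempfile.mkdtemp(prefix='libprs500_')

def finalize(profile):
    # With the stray return gone, cleanup actually executes.
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])

profile = {}
initialize(profile)
finalize(profile)   # the scratch directory is removed again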

View File

@@ -16,9 +16,8 @@
 import sys, urllib2, time, re, tempfile, os, shutil
-from libprs500 import __appname__, iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
+from libprs500.ebooks.lrf.web import build_index, parse_feeds
+from libprs500 import __appname__, iswindows, browser

 RSS_FEEDS = [
     ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
@@ -34,110 +33,24 @@ RSS_FEEDS = [
     ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
     ]
-
-BASE_TEMPLATE=\
-u'''
-<html>
-<body>
-<h1>Newsweek</h1>
-<b align="right">%(date)s</b>
-<p></p>
-<h2>Table of Contents</h2>
-<ul>
-%(toc)s
-</ul>
-<br />
-<hr />
-</body>
-</html>
-'''
-
-SECTION_TEMPLATE=\
-u'''
-<html>
-<body>
-<h2>%(title)s</h2>
-<p></p>
-<h2>Table of Contents</h2>
-<ul>
-%(toc)s
-</ul>
-<br />
-<hr />
-</body>
-</html>
-'''
-
-_tdir = None
-def create_aggregator(sections):
-    '''Return aggregator HTML encoded in utf8'''
-    toc, sec = u'', 0
-    global _tdir
-    _tdir = tempfile.mkdtemp(prefix=__appname__)
-    for section in sections:
-        sec += 1
-        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
-        title, contents = section
-        fix = 'file:' if iswindows else ''
-        toc += '<li><a href="%s">%s</a></li>\n'%(fix+secfile, title,)
-        stoc = u''
-        for item in contents:
-            desc = item['description'].strip()
-            stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
-            if desc:
-                stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
-            stoc += '</li>\n'
-        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
-        open(secfile, 'w').write(section.encode('utf8'))
-    index = os.path.join(_tdir, 'index.html')
-    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
-    open(index, 'w').write(src.encode('utf8'))
-    return index
-
-def get_contents():
-    ''' Parse Newsweek RSS feeds to get links to all articles'''
-
-    def nstounicode(ns):
-        return unicode(str(ns), 'utf8')
-
-    def fix_link(link):
-        if '?' in link:
-            link = link[:link.index('?')]
-        return link + 'print/1/displaymode/1098/'
-
-    def process_description(tag):
-        src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-    pages = []
-    for title, url in RSS_FEEDS:
-        soup = BeautifulStoneSoup(urllib2.urlopen(url))
-        contents = []
-        for item in soup.findAll('item'):
-            d = {
-                'title'      : nstounicode(item.title.contents[0]),
-                'description': process_description(item.description),
-                'link'       : fix_link(nstounicode(item.guid.contents[0]))
-                }
-            if '&lt;' in d['description']:
-                d['description'] = d['description'][:d['description'].index('&lt;')]
-            contents.append(d)
-        pages.append((title, contents))
-    return pages
-
+def print_version(url):
+    if '?' in url:
+        url = url[:url.index('?')]
+    return url + 'print/1/displaymode/1098/'
+
 def initialize(profile):
-    print 'Fetching feeds...',
-    sys.stdout.flush()
-    contents = get_contents()
-    print 'done'
-    index = create_aggregator(contents)
+    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
+    profile['browser'] = browser()
+    articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
+                           max_articles_per_feed=20, html_description=True)
+    index = build_index('Newsweek', articles, profile['temp dir'])
+    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+    profile['timefmt'] = ' [%d %b %Y]'
+    profile['max_recursions'] = 2
+    profile['title'] = 'Newsweek'
     profile['url'] = 'file:'+ ('' if iswindows else '//') +index

 def finalize(profile):
-    global _tdir
-    shutil.rmtree(_tdir)
+    if os.path.isdir(profile['temp dir']):
+        shutil.rmtree(profile['temp dir'])
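
With this change the Newsweek profile delegates feed parsing and index building to the shared web module instead of carrying its own HTML templates and aggregator. The only site-specific logic left is print_version, reproduced below in a small standalone sketch to show how it rewrites an article link into Newsweek's printer-friendly form; the example URL is made up for illustration.

def print_version(url):
    # Same transformation the old fix_link helper performed: drop any
    # query string, then append the printer-friendly path segment.
    if '?' in url:
        url = url[:url.index('?')]
    return url + 'print/1/displaymode/1098/'

# Hypothetical article link, for illustration only:
print print_version('http://www.newsweek.com/id/42463/?from=rss')
# prints: http://www.newsweek.com/id/42463/print/1/displaymode/1098/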

View File

@ -32,7 +32,7 @@ profiles = {
'max_recursions' : 1, # Number of levels of links to follow 'max_recursions' : 1, # Number of levels of links to follow
'max_files' : 1000, # Maximum number of files to download 'max_files' : 1000, # Maximum number of files to download
'delay' : 0, # Delay between consecutive downloads 'delay' : 0, # Delay between consecutive downloads
'timeout' : 10, # Timeout for fetching files from server 'timeout' : 10, # Timeout for fetching files from server in seconds
'timefmt' : ' [%a %d %b %Y]', 'timefmt' : ' [%a %d %b %Y]',
'no_stylesheets' : False, # Download stylesheets 'no_stylesheets' : False, # Download stylesheets
'match_regexps' : [], # List of regular expressions that determines which links to follow 'match_regexps' : [], # List of regular expressions that determines which links to follow
@@ -82,10 +82,7 @@ profiles = {
    'newsweek' : {
        'initialize' : newsweek_initialize,
        'finalize' : newsweek_finalize,
-       'title' : 'Newsweek',
-       'timefmt' : ' [%d %b %Y]',
        'no_stylesheets' : True,
-       'max_recursions' : 2,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                [