Kovid Goyal 2007-10-07 20:12:25 +00:00
parent 4477a78a5b
commit b1dea424fe
4 changed files with 45 additions and 112 deletions

View File

@@ -13,12 +13,32 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, time, calendar, operator
import os, time, calendar, operator, re
from libprs500 import iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint
def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
def process_html_description(tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
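# For illustration (hypothetical input): if the description tag's contents were
#   'Read &ldquo;more&rdquo; <a href="http://example.com">here</a>'
# the entity replacement above turns &ldquo;/&rdquo; into curly quotes and the final
# regexp strips the whole <a>...</a> element, leaving u'Read \u201cmore\u201d '.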
def parse_feeds(feeds, browser, print_version,
max_articles_per_feed=10,
html_description=False,
oldest_article=7):
'''
@param print_version: Callable that takes a url string and returns the url of
the printable version of the article pointed to by the original url.
@param max_articles_per_feed: Maximum number of articles to download from each feed
@param html_description: If True the articles' descriptions are processed as HTML
@param oldest_article: A number in days. No articles older than now - oldest_article
will be downloaded.
'''
articles = {}
for title, url in feeds:
src = browser.open(url).read()
@@ -37,10 +57,14 @@ def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
'%a, %d %b %Y %H:%M:%S %Z')),
'date' : pubdate
}
delta = time.time() - d['timestamp']
if delta > oldest_article*3600*24:
continue
except:
continue
try:
d['description'] = item.find('description').string
desc = item.find('description')
d['description'] = process_html_description(desc) if html_description else desc.string
except:
d['description'] = ''
articles[title].append(d)
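A minimal, hedged sketch of how the extended parse_feeds signature might be called, reusing the Newsweek feed list and the libprs500 browser helper from the files below; example_print_version is a made-up placeholder, not part of this commit:

from libprs500 import browser
from libprs500.ebooks.lrf.web import parse_feeds

feeds = [('Cover Story', 'http://feeds.newsweek.com/CoverStory')]

def example_print_version(url):
    return url + '?print=1' # hypothetical rule mapping an article url to its printable version

articles = parse_feeds(feeds, browser(), example_print_version,
                       max_articles_per_feed=20,
                       html_description=True, # descriptions go through process_html_description
                       oldest_article=7)      # items older than 7*24 hours are skipped
for d in articles['Cover Story']:
    print d['date'], d['description'][:60]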

View File

@@ -76,7 +76,6 @@ def initialize(profile):
profile.pop('browser') # Needed because, for some reason, reusing the same browser instance causes timeouts
def finalize(profile):
return
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -16,9 +16,8 @@
import sys, urllib2, time, re, tempfile, os, shutil
from libprs500 import __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint
from libprs500.ebooks.lrf.web import build_index, parse_feeds
from libprs500 import __appname__, iswindows, browser
RSS_FEEDS = [
('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
@@ -34,110 +33,24 @@ RSS_FEEDS = [
('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
]
BASE_TEMPLATE=\
u'''
<html>
<body>
<h1>Newsweek</h1>
<b align="right">%(date)s</b>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''
SECTION_TEMPLATE=\
u'''
<html>
<body>
<h2>%(title)s</h2>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''
_tdir = None
def create_aggregator(sections):
'''Return aggregator HTML encoded in utf8'''
toc, sec = u'', 0
global _tdir
_tdir = tempfile.mkdtemp(prefix=__appname__)
for section in sections:
sec += 1
secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
title, contents = section
fix = 'file:' if iswindows else ''
toc += '<li><a href="%s">%s</a></li>\n'%(fix+secfile, title,)
stoc = u''
for item in contents:
desc = item['description'].strip()
stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
if desc:
stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
stoc += '</li>\n'
section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
open(secfile, 'w').write(section.encode('utf8'))
index = os.path.join(_tdir, 'index.html')
src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
open(index, 'w').write(src.encode('utf8'))
return index
def get_contents():
''' Parse Newsweek RSS feeds to get links to all articles'''
def nstounicode(ns):
return unicode(str(ns), 'utf8')
def fix_link(link):
if '?' in link:
link = link[:link.index('?')]
return link + 'print/1/displaymode/1098/'
def process_description(tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
pages = []
for title, url in RSS_FEEDS:
soup = BeautifulStoneSoup(urllib2.urlopen(url))
contents = []
for item in soup.findAll('item'):
d = {
'title' : nstounicode(item.title.contents[0]),
'description': process_description(item.description),
'link': fix_link(nstounicode(item.guid.contents[0]))
}
if '&lt;' in d['description']:
d['description'] = d['description'][:d['description'].index('&lt;')]
contents.append(d)
pages.append((title, contents))
return pages
def print_version(url):
if '?' in url:
url = url[:url.index('?')]
return url + 'print/1/displaymode/1098/'
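# For example (hypothetical article url): 'http://www.newsweek.com/id/12345/?from=rss'
# is truncated at the '?' and becomes
# 'http://www.newsweek.com/id/12345/print/1/displaymode/1098/'.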
def initialize(profile):
print 'Fetching feeds...',
sys.stdout.flush()
contents = get_contents()
print 'done'
index = create_aggregator(contents)
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
max_articles_per_feed=20, html_description=True)
index = build_index('Newsweek', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%d %b %Y]'
profile['max_recursions'] = 2
profile['title'] = 'Newsweek'
profile['url'] = 'file:'+ ('' if iswindows else '//') +index
def finalize(profile):
global _tdir
shutil.rmtree(_tdir)
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])
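As a small, hedged illustration of the platform-dependent 'file:' URL that initialize builds above (the index path is hypothetical):

index = '/tmp/libprs500_x/index.html' # hypothetical path returned by build_index
url = 'file:' + ('' if iswindows else '//') + index
# On Unix this yields 'file:///tmp/libprs500_x/index.html'; on Windows, where iswindows is True
# and index would be a native path such as r'C:\temp\index.html', it yields 'file:C:\temp\index.html'.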

View File

@@ -32,7 +32,7 @@ profiles = {
'max_recursions' : 1, # Number of levels of links to follow
'max_files' : 1000, # Maximum number of files to download
'delay' : 0, # Delay between consecutive downloads
'timeout' : 10, # Timeout for fetching files from server
'timeout' : 10, # Timeout for fetching files from server in seconds
'timefmt' : ' [%a %d %b %Y]',
'no_stylesheets' : False, # Download stylesheets
'match_regexps' : [], # List of regular expressions that determine which links to follow
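A hedged sketch of how a profile entry can override these defaults; the 'example_site' entry and its hooks are hypothetical, not part of this commit:

'example_site' : {
    'initialize'     : example_initialize, # hypothetical hook functions
    'finalize'       : example_finalize,
    'max_recursions' : 2,    # follow links two levels deep
    'delay'          : 1,    # wait one second between downloads
    'timeout'        : 20,   # per-fetch timeout in seconds
    'no_stylesheets' : True, # do not download stylesheets
    'match_regexps'  : [r'http://example\.com/story/.+'], # only follow story links
},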
@@ -82,10 +82,7 @@ profiles = {
'newsweek' : {
'initialize' : newsweek_initialize,
'finalize' : newsweek_finalize,
'title' : 'Newsweek',
'timefmt' : ' [%d %b %Y]',
'no_stylesheets' : True,
'max_recursions' : 2,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[