Implemented newsweek profile.

Kovid Goyal 2007-07-14 18:27:38 +00:00
parent 5e8fdf7b2b
commit e07305bcac
4 changed files with 185 additions and 6 deletions

View File: libprs500/__init__.py

@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.70"
+__version__ = "0.3.71"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'

View File

@@ -61,8 +61,7 @@ def option_parser():
 def fetch_website(options):
     tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
-    options.dir = tdir
-    web2disk_setup_logger(options)
+    options.dir = tdir
     fetcher = create_fetcher(options)
     fetcher.preprocess_regexps = options.preprocess_regexps
     return fetcher.start_fetch(options.url), tdir
@@ -77,6 +76,7 @@ def create_lrf(htmlfile, options):
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
+    web2disk_setup_logger(options)
     if len(args) > 2:
         parser.print_help()
         return 1
@@ -87,6 +87,9 @@ def main(args=sys.argv):
         return 1
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
+    if profile.has_key('initialize'):
+        profile['initialize'](profile)
     for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
         val = getattr(options, opt)
         if val is None:
@@ -103,7 +106,7 @@
     title = profile['title']
     if not title:
         title = urlsplit(options.url).netloc
-    options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
+    options.title = title + time.strftime(profile['timefmt'], time.localtime())
     options.match_regexps += profile['match_regexps']
     options.preprocess_regexps = profile['preprocess_regexps']
@@ -111,6 +114,8 @@
     htmlfile, tdir = fetch_website(options)
     create_lrf(htmlfile, options)
+    if profile.has_key('finalize'):
+        profile['finalize'](profile)
     shutil.rmtree(tdir)
     return 0
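
The hunks above give a profile control over the conversion before and after the fetch. A minimal sketch of the hook flow that main() now implements, using a stand-in profile dict; the demo_* functions are hypothetical names, not part of this commit:

# Sketch of the initialize/finalize flow, Python 2 style as in the codebase.
def demo_initialize(profile):
    # Runs before fetching; may point the fetcher at generated content,
    # as the newsweek profile does by rewriting profile['url'].
    profile['url'] = 'file:///tmp/demo/index.html'

def demo_finalize(profile):
    # Runs after the LRF has been created; the place to delete temp files.
    pass

profile = {'initialize': demo_initialize, 'finalize': demo_finalize}

if profile.has_key('initialize'):   # same has_key idiom as in main()
    profile['initialize'](profile)
# ... fetch_website(options) and create_lrf(htmlfile, options) run here ...
if profile.has_key('finalize'):
    profile['finalize'](profile)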

View File: libprs500/ebooks/lrf/web/newsweek.py

@@ -0,0 +1,141 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Logic to create a Newsweek HTML aggregator from RSS feeds'''

import sys, urllib2, time, re, tempfile, os, shutil

from libprs500 import __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint

RSS_FEEDS = [
    ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
    ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
    ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
    ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
    ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
    ('Health', 'http://feeds.newsweek.com/sections/health'),
    ('Society', 'http://feeds.newsweek.com/newsweek/society'),
    ('Business', 'http://feeds.newsweek.com/newsweek/business'),
    ('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
    ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
    ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
]

BASE_TEMPLATE=\
u'''
<html>
<body>
<h1>Newsweek</h1>
<b align="right">%(date)s</b>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

SECTION_TEMPLATE=\
u'''
<html>
<body>
<h2>%(title)s</h2>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

_tdir = None

def create_aggregator(sections):
    '''Return aggregator HTML encoded in utf8'''
    toc, sec = u'', 0
    global _tdir
    _tdir = tempfile.mkdtemp(prefix=__appname__)
    for section in sections:
        sec += 1
        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
        title, contents = section
        toc += '<li><a href="%s">%s</a></li>\n'%(secfile, title,)
        stoc = u''
        for item in contents:
            desc = item['description'].strip()
            stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
            if desc:
                stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
            stoc += '</li>\n'
        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
        open(secfile, 'w').write(section.encode('utf8'))
    index = os.path.join(_tdir, 'index.html')
    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
    open(index, 'w').write(src.encode('utf8'))
    return index

def get_contents():
    ''' Parse Newsweek RSS feeds to get links to all articles'''

    def nstounicode(ns):
        return unicode(str(ns), 'utf8')

    def fix_link(link):
        if '?' in link:
            link = link[:link.index('?')]
        return link + 'print/1/displaymode/1098/'

    def process_description(tag):
        src = '\n'.join(tag.contents)
        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
        for e in replaced_entities:
            ent = '&'+e+';'
            src = src.replace(ent, unichr(name2codepoint[e]))
        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

    pages = []
    for title, url in RSS_FEEDS:
        soup = BeautifulStoneSoup(urllib2.urlopen(url))
        contents = []
        for item in soup.findAll('item'):
            d = {
                'title' : nstounicode(item.title.contents[0]),
                'description': process_description(item.description),
                'link': fix_link(nstounicode(item.guid.contents[0]))
                }
            if '&lt;' in d['description']:
                d['description'] = d['description'][:d['description'].index('&lt;')]
            contents.append(d)
        pages.append((title, contents))
    return pages

def initialize(profile):
    print 'Fetching feeds...',
    sys.stdout.flush()
    contents = get_contents()
    print 'done'
    index = create_aggregator(contents)
    profile['url'] = 'file://'+index

def finalize(profile):
    global _tdir
    shutil.rmtree(_tdir)
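
Taken together, initialize() writes a local two-level HTML tree (index.html linking to section pages linking to articles) and points the fetcher at it, which is why the profile below sets 'max_recursions' to 2. A quick illustration of the rewrite fix_link performs, with a made-up article URL; the function is reproduced here because it is nested inside get_contents() and not importable on its own:

# fix_link drops the tracking query string and requests the
# printer-friendly single-page variant of the article.
def fix_link(link):
    if '?' in link:
        link = link[:link.index('?')]
    return link + 'print/1/displaymode/1098/'

print fix_link('http://www.newsweek.com/id/32405/?from=rss')
# -> http://www.newsweek.com/id/32405/print/1/displaymode/1098/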

View File

@@ -13,9 +13,11 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Profiles for known websites.'''
 import time, re
+from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
+from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize

 profiles = {
     'default' : {
         'url' : '', # The URL of the website
@@ -24,6 +26,7 @@ profiles = {
         'max_files' : 1000, # Maximum number of files to download
         'delay' : 0, # Delay between consecutive downloads
         'timeout' : 10, # Timeout for fetching files from server
+        'timefmt' : ' [%a %d %b %Y]',
         'no_stylesheets' : False, # Download stylesheets
         'match_regexps' : [], # List of regular expressions that determines which links to follow
         'filter_regexps' : [], # List of regular expressions that determines which links to ignore
@@ -78,7 +81,37 @@ profiles = {
             '<style type="text/css">.headline {font-size: x-large;}</style>'),
             ]
         ],
     },
+    'newsweek' : {
+        'initialize' : newsweek_initialize,
+        'finalize' : newsweek_finalize,
+        'title' : 'Newsweek',
+        'timefmt' : ' [%d %b %Y]',
+        'no_stylesheets' : True,
+        'max_recursions' : 2,
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Make fonts larger
+                (r'<style.*?\.copyright.*?</style>',
+                 lambda match : \
+                 '''<style type="text/css">'''
+                 '''updateTime{font:small Arial;color:#000000;}'''
+                 '''.credit{font:small Arial;color:#999999;}'''
+                 '''.head{font:bold 18pt x-large;color:#CC0000;}'''
+                 '''.abstract{font:14pt large Verdana;color:#000000;}'''
+                 '''.title{font:bold;color:#000000;}'''
+                 '''.source{font:bold small Verdana;color:#CC0000;}'''
+                 '''.footerLink{font:bold Verdana;color:#000000;}'''
+                 '''.caption{font: Verdana;color:#000000;}'''
+                 '''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
+                 '''.copyright{font-style:italic;}'''
+                 '''</style>'''
+                ),
+              ]
+            ],
+    },
 }
for key in profiles.keys():
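
The diff is cut off at this final loop; its body is not shown above. A plausible sketch of what such a loop would do, assuming it back-fills every profile with the 'default' values so that lookups like profile['timefmt'] and profile['match_regexps'] in convert_from.py always succeed; this reconstruction is an assumption, not part of the visible diff:

# Assumed body of the truncated loop: merge each profile over the defaults
# so every key consumed by main() is guaranteed to be present.
for key in profiles.keys():
    if key == 'default':
        continue
    merged = profiles['default'].copy()  # start from the default settings
    merged.update(profiles[key])         # profile-specific keys win
    profiles[key] = merged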