mirror of https://github.com/kovidgoyal/calibre.git

Implemented newsweek profile.

commit e07305bcac (parent 5e8fdf7b2b)
@@ -13,7 +13,7 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''' E-book management software'''
__version__ = "0.3.70"
__version__ = "0.3.71"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
__appname__ = 'libprs500'
@@ -61,8 +61,7 @@ def option_parser():
def fetch_website(options):
    tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
    options.dir = tdir
    web2disk_setup_logger(options)
    options.dir = tdir
    fetcher = create_fetcher(options)
    fetcher.preprocess_regexps = options.preprocess_regexps
    return fetcher.start_fetch(options.url), tdir
@@ -77,6 +76,7 @@ def create_lrf(htmlfile, options):
def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    web2disk_setup_logger(options)
    if len(args) > 2:
        parser.print_help()
        return 1
@@ -87,6 +87,9 @@ def main(args=sys.argv):
        return 1
    profile = profiles[args[1]] if len(args) == 2 else profiles['default']

    if profile.has_key('initialize'):
        profile['initialize'](profile)

    for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
        val = getattr(options, opt)
        if val is None:
@@ -103,7 +106,7 @@ def main(args=sys.argv):
    title = profile['title']
    if not title:
        title = urlsplit(options.url).netloc
    options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
    options.title = title + time.strftime(profile['timefmt'], time.localtime())

    options.match_regexps += profile['match_regexps']
    options.preprocess_regexps = profile['preprocess_regexps']
@@ -111,6 +114,8 @@ def main(args=sys.argv):

    htmlfile, tdir = fetch_website(options)
    create_lrf(htmlfile, options)
    if profile.has_key('finalize'):
        profile['finalize'](profile)
    shutil.rmtree(tdir)

    return 0
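Taken together, the hunks above make main() profile-aware: the selected profile can now supply optional 'initialize' and 'finalize' hooks and its own 'timefmt' for the title. A condensed sketch of the resulting flow (not the literal code; option parsing, error handling and the title fallback to the URL's netloc are omitted, and every name comes from the diff above):

profile = profiles[args[1]] if len(args) == 2 else profiles['default']

if profile.has_key('initialize'):           # e.g. newsweek_initialize builds a local
    profile['initialize'](profile)          # aggregator and points profile['url'] at it

options.title = profile['title'] + time.strftime(profile['timefmt'], time.localtime())

htmlfile, tdir = fetch_website(options)     # download into a temporary directory
create_lrf(htmlfile, options)               # convert the downloaded HTML to LRF
if profile.has_key('finalize'):             # e.g. newsweek_finalize removes its own temp dir
    profile['finalize'](profile)
shutil.rmtree(tdir)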
src/libprs500/ebooks/lrf/web/newsweek.py (new file, 141 lines)
@@ -0,0 +1,141 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Logic to create a Newsweek HTML aggregator from RSS feeds'''

import sys, urllib2, time, re, tempfile, os, shutil

from libprs500 import __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint

RSS_FEEDS = [
    ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
    ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
    ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
    ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
    ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
    ('Health', 'http://feeds.newsweek.com/sections/health'),
    ('Society', 'http://feeds.newsweek.com/newsweek/society'),
    ('Business', 'http://feeds.newsweek.com/newsweek/business'),
    ('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
    ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
    ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
]

BASE_TEMPLATE=\
u'''
<html>
<body>
<h1>Newsweek</h1>
<b align="right">%(date)s</b>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

SECTION_TEMPLATE=\
u'''
<html>
<body>
<h2>%(title)s</h2>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

_tdir = None
def create_aggregator(sections):
    '''Return aggregator HTML encoded in utf8'''
    toc, sec = u'', 0
    global _tdir
    _tdir = tempfile.mkdtemp(prefix=__appname__)
    for section in sections:
        sec += 1
        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
        title, contents = section
        toc += '<li><a href="%s">%s</a></li>\n'%(secfile, title,)
        stoc = u''
        for item in contents:
            desc = item['description'].strip()
            stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
            if desc:
                stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
            stoc += '</li>\n'
        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
        open(secfile, 'w').write(section.encode('utf8'))
    index = os.path.join(_tdir, 'index.html')
    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
    open(index, 'w').write(src.encode('utf8'))
    return index

def get_contents():
    ''' Parse Newsweek RSS feeds to get links to all articles'''

    def nstounicode(ns):
        return unicode(str(ns), 'utf8')

    def fix_link(link):
        if '?' in link:
            link = link[:link.index('?')]
        return link + 'print/1/displaymode/1098/'

    def process_description(tag):
        src = '\n'.join(tag.contents)
        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
        for e in replaced_entities:
            ent = '&'+e+';'
            src = src.replace(ent, unichr(name2codepoint[e]))
        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

    pages = []
    for title, url in RSS_FEEDS:
        soup = BeautifulStoneSoup(urllib2.urlopen(url))
        contents = []
        for item in soup.findAll('item'):
            d = {
                'title' : nstounicode(item.title.contents[0]),
                'description': process_description(item.description),
                'link': fix_link(nstounicode(item.guid.contents[0]))
                }
            if '<' in d['description']:
                d['description'] = d['description'][:d['description'].index('<')]
            contents.append(d)
        pages.append((title, contents))
    return pages


def initialize(profile):
    print 'Fetching feeds...',
    sys.stdout.flush()
    contents = get_contents()
    print 'done'
    index = create_aggregator(contents)
    profile['url'] = 'file://'+index

def finalize(profile):
    global _tdir
    shutil.rmtree(_tdir)
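The new module is usable on its own: get_contents() walks the RSS feeds and returns a list of (section title, article list) pairs, and create_aggregator() writes one HTML page per section plus an index.html into a temporary directory. A minimal standalone exercise of it, assuming the Newsweek feeds are reachable, would be roughly:

# Rough standalone use of the aggregator, outside the profile machinery
# (assumes network access to feeds.newsweek.com).
from libprs500.ebooks.lrf.web.newsweek import get_contents, create_aggregator

contents = get_contents()             # [(u'Cover Story', [{'title': ..., 'link': ..., 'description': ...}, ...]), ...]
index = create_aggregator(contents)   # writes sec1.html, sec2.html, ... and index.html
print 'Aggregator written to', index  # the path initialize() exposes as profile['url'] = 'file://' + index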
@@ -13,9 +13,11 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Profiles for known websites.'''

import time, re

from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize

profiles = {
    'default' : {
        'url' : '', # The URL of the website
@@ -24,6 +26,7 @@ profiles = {
        'max_files' : 1000, # Maximum number of files to download
        'delay' : 0, # Delay between consecutive downloads
        'timeout' : 10, # Timeout for fetching files from server
        'timefmt' : ' [%a %d %b %Y]',
        'no_stylesheets' : False, # Download stylesheets
        'match_regexps' : [], # List of regular expressions that determines which links to follow
        'filter_regexps' : [], # List of regular expressions that determines which links to ignore
@@ -78,7 +81,37 @@ profiles = {
                 '<style type="text/css">.headline {font-size: x-large;}</style>'),
            ]
            ],
        },
    },

    'newsweek' : {
        'initialize' : newsweek_initialize,
        'finalize' : newsweek_finalize,
        'title' : 'Newsweek',
        'timefmt' : ' [%d %b %Y]',
        'no_stylesheets' : True,
        'max_recursions' : 2,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
              [
                # Make fonts larger
                (r'<style.*?\.copyright.*?</style>',
                 lambda match : \
                 '''<style type="text/css">'''
                 '''updateTime{font:small Arial;color:#000000;}'''
                 '''.credit{font:small Arial;color:#999999;}'''
                 '''.head{font:bold 18pt x-large;color:#CC0000;}'''
                 '''.abstract{font:14pt large Verdana;color:#000000;}'''
                 '''.title{font:bold;color:#000000;}'''
                 '''.source{font:bold small Verdana;color:#CC0000;}'''
                 '''.footerLink{font:bold Verdana;color:#000000;}'''
                 '''.caption{font: Verdana;color:#000000;}'''
                 '''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
                 '''.copyright{font-style:italic;}'''
                 '''</style>'''
                 ),
              ]
            ],
    },
}

for key in profiles.keys():
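The 'newsweek' entry shows the pattern any new site profile would follow: list only the keys that differ from 'default', and supply the optional 'initialize'/'finalize' hooks when the site needs work done before or after fetching. A hypothetical entry for another feed-driven site, assuming unspecified keys fall back to the 'default' values above, might look like:

# Hypothetical profile entry; the site name and values are illustrative only,
# and keys such as 'timeout', 'max_files' and 'delay' are assumed to be
# inherited from the 'default' profile.
'example_weekly' : {
    'url'            : 'http://www.example.com/current-issue.html',
    'title'          : 'Example Weekly',
    'timefmt'        : ' [%d %b %Y]',
    'no_stylesheets' : True,
    'max_recursions' : 2,
},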