mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-30 23:00:21 -04:00
Improved nytimes profile.
This commit is contained in:
parent
4ecab43cd0
commit
0c95bc3d6d
@ -39,6 +39,10 @@ def option_parser():
|
||||
parser.add_option('-u', '--url', dest='url', default=None,
|
||||
help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
|
||||
|
||||
parser.add_option('--username', dest='username', default=None,
|
||||
help='Specify the username to be used while downloading. Only used if the profile supports it.')
|
||||
parser.add_option('--password', dest='password', default=None,
|
||||
help='Specify the password to be used while downloading. Only used if the profile supports it.')
|
||||
parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
|
||||
default=None, type='int', dest='timeout')
|
||||
parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
|
||||
@ -64,7 +68,7 @@ def fetch_website(options, logger):
|
||||
return fetcher.start_fetch(options.url), tdir
|
||||
|
||||
def create_lrf(htmlfile, options, logger):
|
||||
if not options.author:
|
||||
if not options.author or options.author.lower() == 'unknown':
|
||||
options.author = __appname__
|
||||
options.header = True
|
||||
if options.output:
|
||||
@ -83,9 +87,12 @@ def process_profile(args, options, logger=None):
|
||||
if not profiles.has_key(args[1]):
|
||||
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
|
||||
profile = profiles[args[1]] if len(args) == 2 else profiles['default']
|
||||
|
||||
profile['username'] = options.username
|
||||
profile['password'] = options.password
|
||||
if profile.has_key('initialize'):
|
||||
profile['initialize'](profile)
|
||||
if profile.has_key('browser'):
|
||||
options.browser = profile['browser']
|
||||
|
||||
for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
|
||||
val = getattr(options, opt)
|
||||
@ -104,12 +111,15 @@ def process_profile(args, options, logger=None):
|
||||
options.match_regexps += profile['match_regexps']
|
||||
options.preprocess_regexps = profile['preprocess_regexps']
|
||||
options.filter_regexps += profile['filter_regexps']
|
||||
if len(args) == 2 and args[1] != 'default':
|
||||
options.anchor_ids = False
|
||||
|
||||
htmlfile, tdir = fetch_website(options, logger)
|
||||
create_lrf(htmlfile, options, logger)
|
||||
if profile.has_key('finalize'):
|
||||
profile['finalize'](profile)
|
||||
shutil.rmtree(tdir)
|
||||
|
||||
|
||||
def main(args=sys.argv, logger=None):
|
||||
parser = option_parser()
|
||||
|
146
src/libprs500/ebooks/lrf/web/nytimes.py
Normal file
146
src/libprs500/ebooks/lrf/web/nytimes.py
Normal file
@ -0,0 +1,146 @@
|
||||
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
## the Free Software Foundation; either version 2 of the License, or
|
||||
## (at your option) any later version.
|
||||
##
|
||||
## This program is distributed in the hope that it will be useful,
|
||||
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
## GNU General Public License for more details.
|
||||
##
|
||||
## You should have received a copy of the GNU General Public License along
|
||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
'''New York Times from RSS feeds.'''
|
||||
import time, tempfile, os, shutil, calendar, operator
|
||||
|
||||
from libprs500 import __appname__, iswindows, browser
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
|
||||
|
||||
RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
|
||||
LOGIN = 'http://www.nytimes.com/auth/login'
|
||||
|
||||
def get_feeds(browser):
    '''
    Fetch the NYT RSS index page with *browser* and return a list of
    (title, url) tuples, one per RSS feed, skipping sections that are
    deliberately excluded from the generated ebook.
    '''
    # Sections we do not want in the downloaded paper.
    skipped = ('NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
               'Dining & Wine', 'Home & Garden', 'Magazine',
               'Most E-mailed Articles',
               'Automobiles', 'Fashion & Style', 'Television News',
               'Education')
    markup = browser.open(RSS).read()
    # The response may carry junk before the document; parse from the
    # first <html> tag onwards.
    soup = BeautifulSoup(markup[markup.index('<html'):])
    links = soup.findAll('link', attrs={'type':'application/rss+xml'})
    return [(link['title'], link['href']) for link in links
            if link['title'] not in skipped]
|
||||
|
||||
def parse_feeds(feeds, browser, max_articles_per_feed=10):
    '''
    Download every feed in *feeds* (a list of (title, url) tuples) with
    *browser* and return a dict mapping feed title to a list of article
    dicts with the keys 'title', 'url', 'date' and 'description',
    newest articles first.

    Items with a missing or unparseable <pubdate> are skipped, and each
    feed is truncated to *max_articles_per_feed* entries.
    '''
    articles = {}
    for title, url in feeds:
        src = browser.open(url).read()
        articles[title] = []
        soup = BeautifulStoneSoup(src)
        for item in soup.findAll('item'):
            try:
                pubdate = item.find('pubdate').string
                if not pubdate:
                    continue
                # strptime's %Z understands 'GMT' but not numeric offsets.
                pubdate = pubdate.replace('+0000', 'GMT')
                d = {
                    'title'    : item.find('title').string,
                    # Request the print-friendly version of the article.
                    'url'      : item.find('guid').string+'?&pagewanted=print',
                    'timestamp': calendar.timegm(time.strptime(pubdate,
                                                '%a, %d %b %Y %H:%M:%S %Z')),
                    'date'     : pubdate
                    }
            # Was a bare except:, which also swallowed KeyboardInterrupt
            # and SystemExit; only malformed items should be skipped.
            except Exception:
                continue
            try:
                d['description'] = item.find('description').string
            except Exception:
                d['description'] = ''
            articles[title].append(d)
        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
        # Keep only the newest max_articles_per_feed entries.
        articles[title][max_articles_per_feed:] = []
        for item in articles[title]:
            # The timestamp was only needed as a sort key.
            item.pop('timestamp')
    return articles
|
||||
|
||||
def build_index(articles, dir):
    '''
    Write one HTML page per category plus a master index.html into
    *dir* and return the path to the master index file.
    '''

    def category_page(heading, entries):
        # Render a single feed category as a standalone HTML page.
        row = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
              u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
        rows = []
        for entry in entries:
            rows.append(row%entry)
        return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=heading, items=''.join(rows).rstrip())

    clist = ''
    for num, category in enumerate(sorted(articles.keys())):
        cfile = os.path.join(dir, 'category'+str(num+1)+'.html')
        # Windows needs an explicit file: prefix for local links.
        prefix = 'file:' if iswindows else ''
        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
        src = category_page(category, articles[category])
        open(cfile, 'wb').write(src.encode('utf-8'))

    src = '''\
<html>
<body>
<h1>The New York Times</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist)
    index = os.path.join(dir, 'index.html')
    open(index, 'wb').write(src.encode('utf-8'))
    return index
|
||||
|
||||
|
||||
def initialize(profile):
    '''
    Prepare the nytimes profile: log in, fetch and parse the RSS feeds,
    and point profile['url'] at a locally built index page.
    '''
    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    br = login(profile)
    profile['browser'] = br
    index = build_index(parse_feeds(get_feeds(br), br), profile['temp dir'])
    # file: URLs need two extra slashes everywhere except Windows.
    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
|
||||
|
||||
|
||||
def finalize(profile):
    '''Remove the temporary directory created by initialize().'''
    tdir = profile['temp dir']
    shutil.rmtree(tdir)
|
||||
|
||||
|
||||
def login(profile):
    '''
    Return a browser instance; if both username and password are set in
    *profile*, log in to nytimes.com with them first.
    '''
    br = browser()
    username = profile['username']
    password = profile['password']
    if username and password:
        br.open(LOGIN)
        br.select_form(name='login')
        br['USERID'] = username
        br['PASSWORD'] = password
        br.submit()
    return br
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
feeds = get_feeds()
|
||||
articles = parse_feeds(feeds)
|
||||
print articles
|
||||
|
@ -17,6 +17,9 @@ import time, re
|
||||
|
||||
from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
|
||||
from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
|
||||
from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
|
||||
from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
|
||||
|
||||
|
||||
profiles = {
|
||||
'default' : {
|
||||
@ -37,32 +40,25 @@ profiles = {
|
||||
},
|
||||
|
||||
'nytimes' : {
|
||||
'url' : 'http://nytimesriver.com',
|
||||
'initialize' : nytimes_initialize,
|
||||
'finalize' : nytimes_finalize,
|
||||
'timefmt' : ' [%a, %d %b, %Y]',
|
||||
'max_recursions' : 2,
|
||||
'title' : 'The New York Times',
|
||||
'match_regexps' : 'nytimes.com/'+time.strftime('%Y', time.localtime()),
|
||||
'preprocess_regexps' :
|
||||
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
[
|
||||
# Remove help link and replace by title
|
||||
(r'<a .*?alt=.Click here for information about this service.*?</a>',
|
||||
lambda match: '<h1>The New York Times</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
|
||||
# Blank line before categories
|
||||
(r'<b>\s*NYT', lambda match: '<p></p><b>NYT'),
|
||||
# Blank line between articles
|
||||
(r'<p><a href', lambda match : '<br /><p><a href'),
|
||||
# Remove header on individual articles
|
||||
(r'<body class=.printerversion..*?<h1><nyt_headline',
|
||||
lambda match : '<body class="printerversion">\n<h1><nyt_headline'),
|
||||
# Remove footer from individiual articles
|
||||
(r'<nyt_update_bottom.*', lambda match : '</body></html>'),
|
||||
# Remove TimesSelect garbage
|
||||
(r'<title>.*?TimesSelect', lambda match : 'Downloading of TimesSelect stories is not supported.<!--'),
|
||||
# Remove header bar
|
||||
(r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
|
||||
(r'<div class="articleTools">.*></ul>', lambda match : ''),
|
||||
# Remove footer bar
|
||||
(r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
|
||||
(r'<div id="footer">.*', lambda match : '</body></html>'),
|
||||
]
|
||||
],
|
||||
},
|
||||
|
||||
'bbc' : {
|
||||
'url' : 'http://bbcriver.com',
|
||||
'title' : 'The BBC',
|
||||
'no_stylesheets' : True,
|
||||
'preprocess_regexps' :
|
||||
|
Loading…
x
Reference in New Issue
Block a user