mirror of https://github.com/kovidgoyal/calibre.git
commit 684d03da1f (parent ef0ec5bd71)

New BBC profile.
src/libprs500/ebooks/lrf/web/__init__.py
@@ -12,3 +12,88 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+import os, time, calendar, operator
+
+from libprs500 import iswindows
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+
+def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
+    articles = {}
+    for title, url in feeds:
+        src = browser.open(url).read()
+        articles[title] = []
+        soup = BeautifulStoneSoup(src)
+        for item in soup.findAll('item'):
+            try:
+                pubdate = item.find('pubdate').string
+                if not pubdate:
+                    continue
+                pubdate = pubdate.replace('+0000', 'GMT')
+                d = {
+                    'title'    : item.find('title').string,
+                    'url'      : print_version(item.find('guid').string),
+                    'timestamp': calendar.timegm(time.strptime(pubdate,
+                                     '%a, %d %b %Y %H:%M:%S %Z')),
+                    'date'     : pubdate
+                    }
+            except:
+                continue
+            try:
+                d['description'] = item.find('description').string
+            except:
+                d['description'] = ''
+            articles[title].append(d)
+        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
+        articles[title][max_articles_per_feed:] = []
+        for item in articles[title]:
+            item.pop('timestamp')
+    return articles
+
+
+def build_index(title, articles, dir):
+    '''Build an RSS based index.html'''
+
+    def build_sub_index(title, items):
+        ilist = ''
+        li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
+             u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
+        for item in items:
+            ilist += li%item
+        return u'''\
+<html>
+<body>
+<h2>%(title)s</h2>
+<ul>
+%(items)s
+</ul>
+</body>
+</html>
+'''%dict(title=title, items=ilist.rstrip())
+
+    cnum = 0
+    clist = ''
+    categories = articles.keys()
+    categories.sort()
+    for category in categories:
+        cnum += 1
+        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
+        prefix = 'file:' if iswindows else ''
+        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
+        src = build_sub_index(category, articles[category])
+        open(cfile, 'wb').write(src.encode('utf-8'))
+
+    src = '''\
+<html>
+<body>
+<h1>%(title)s</h1>
+<div style='text-align: right; font-weight: bold'>%(date)s</div>
+<ul>
+%(categories)s
+</ul>
+</body>
+</html>
+'''%dict(title=title, date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist)
+    index = os.path.join(dir, 'index.html')
+    open(index, 'wb').write(src.encode('utf-8'))
+    return index
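For context, the new shared helpers are profile-agnostic: parse_feeds downloads each RSS feed and normalizes its items into dicts sorted newest-first, and build_index writes one HTML page per category plus a top-level index.html. A minimal sketch of a caller, using an illustrative feed list and a pass-through print_version (both assumptions for illustration, not part of this commit):

# Hedged sketch of driving the new helpers from a profile module.
# The feed list and title below are illustrative only.
import tempfile
from libprs500 import browser
from libprs500.ebooks.lrf.web import parse_feeds, build_index

feeds = [('Top Stories', 'http://example.com/rss.xml')]   # hypothetical feed
b = browser()
# identity print_version: keep each article URL unchanged
articles = parse_feeds(feeds, b, lambda url: url, max_articles_per_feed=5)
print build_index('Example Paper', articles, tempfile.mkdtemp())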
src/libprs500/ebooks/lrf/web/bbc.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+import tempfile, shutil, os
+
+from libprs500 import __appname__, iswindows, browser
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.lrf.web import build_index, parse_feeds
+
+RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'
+
+
+def get_feeds(browser):
+    src = browser.open(RSS).read()
+    soup = BeautifulSoup(src[src.index('<html'):])
+    feeds = []
+    ul = soup.find('ul', attrs={'class':'rss'})
+    for link in ul.findAll('a'):
+        feeds.append((link.string, link['href']))
+    return feeds
+
+
+def initialize(profile):
+    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
+    profile['browser'] = browser()
+    feeds = get_feeds(profile['browser'])
+    articles = parse_feeds(feeds, profile['browser'],
+        lambda x: x.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/'))
+    index = build_index('The BBC', articles, profile['temp dir'])
+    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+    profile['timefmt'] = ' [%a, %d %b, %Y]'
+    profile['max_recursions'] = 2
+    profile['title'] = 'The BBC'
+    profile['no_stylesheets'] = True
+
+
+def finalize(profile):
+    if os.path.isdir(profile['temp dir']):
+        shutil.rmtree(profile['temp dir'])
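The lambda passed to parse_feeds above turns each article link into the BBC's print-friendly mirror simply by prefixing the scheme-stripped URL. An illustration with a made-up story URL:

# Illustration of the BBC print-version rewrite; the story URL is hypothetical.
print_version = lambda x: x.replace('http://',
    'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
print print_version('http://news.bbc.co.uk/1/hi/world/1234567.stm')
# -> http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk/1/hi/world/1234567.stm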
@@ -79,46 +79,53 @@ def create_lrf(htmlfile, options, logger):
     process_file(htmlfile, options, logger)
 
 def process_profile(args, options, logger=None):
-    if logger is None:
-        level = logging.DEBUG if options.verbose else logging.INFO
-        logger = logging.getLogger('web2lrf')
-        setup_cli_handlers(logger, level)
-    if len(args) == 2:
-        if not profiles.has_key(args[1]):
-            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
-    profile = profiles[args[1]] if len(args) == 2 else profiles['default']
-    profile['username'] = options.username
-    profile['password'] = options.password
-    if profile.has_key('initialize'):
-        profile['initialize'](profile)
-    if profile.has_key('browser'):
-        options.browser = profile['browser']
-
-    for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
-        val = getattr(options, opt)
-        if val is None:
-            setattr(options, opt, profile[opt])
-
-    if not options.url:
-        raise CommandLineError('You must specify the --url option or a profile from one of: %s', available_profiles)
-
-    if not options.title:
-        title = profile['title']
-        if not title:
-            title = urlsplit(options.url).netloc
-        options.title = title + time.strftime(profile['timefmt'], time.localtime())
-
-    options.match_regexps += profile['match_regexps']
-    options.preprocess_regexps = profile['preprocess_regexps']
-    options.filter_regexps += profile['filter_regexps']
-    if len(args) == 2 and args[1] != 'default':
-        options.anchor_ids = False
-
-    htmlfile, tdir = fetch_website(options, logger)
-    create_lrf(htmlfile, options, logger)
-    if profile.has_key('finalize'):
-        profile['finalize'](profile)
-    shutil.rmtree(tdir)
+    tdir = None
+    try:
+        if logger is None:
+            level = logging.DEBUG if options.verbose else logging.INFO
+            logger = logging.getLogger('web2lrf')
+            setup_cli_handlers(logger, level)
+        if len(args) == 2:
+            if not profiles.has_key(args[1]):
+                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
+        profile = profiles[args[1]] if len(args) == 2 else profiles['default']
+        profile['username'] = options.username
+        profile['password'] = options.password
+        if profile.has_key('initialize'):
+            profile['initialize'](profile)
+        if profile.has_key('browser'):
+            options.browser = profile['browser']
+
+        for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
+            val = getattr(options, opt)
+            if val is None:
+                setattr(options, opt, profile[opt])
+
+        if not options.url:
+            options.url = profile['url']
+        if not options.url:
+            raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
+
+        if not options.title:
+            title = profile['title']
+            if not title:
+                title = urlsplit(options.url).netloc
+            options.title = title + time.strftime(profile['timefmt'], time.localtime())
+
+        options.match_regexps += profile['match_regexps']
+        options.preprocess_regexps = profile['preprocess_regexps']
+        options.filter_regexps += profile['filter_regexps']
+        if len(args) == 2 and args[1] != 'default':
+            options.anchor_ids = False
+
+        htmlfile, tdir = fetch_website(options, logger)
+        create_lrf(htmlfile, options, logger)
+    finally:
+        if profile.has_key('finalize'):
+            profile['finalize'](profile)
+        if tdir and os.path.isdir(tdir):
+            shutil.rmtree(tdir)
 
 def main(args=sys.argv, logger=None):
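Two details of the rework are worth noting: options left at None on the command line are backfilled from the profile dict, and tdir is bound before the try block so the finally clause can always test it, even when fetch_website never ran. A standalone sketch of that pattern, with dummy names (stand-ins, not the real web2lrf objects):

# Standalone sketch of profile fallback plus guaranteed cleanup;
# Options and the profile dict here are stand-ins, not web2lrf's own.
import os, shutil, tempfile

class Options(object):
    url, timeout, max_recursions = None, None, None

def run(options, profile):
    tdir = None                      # bound before try so finally can test it
    try:
        for opt in ('url', 'timeout', 'max_recursions'):
            if getattr(options, opt) is None:        # flag not given on CLI
                setattr(options, opt, profile[opt])  # fall back to profile
        tdir = tempfile.mkdtemp()
        # ... fetch the site into tdir and convert it ...
    finally:
        if tdir and os.path.isdir(tdir):             # clean up even on error
            shutil.rmtree(tdir)

run(Options(), {'url': 'http://example.com', 'timeout': 10, 'max_recursions': 2})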
src/libprs500/ebooks/lrf/web/nytimes.py
@@ -13,10 +13,11 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''New York Times from RSS feeds.'''
-import time, tempfile, os, shutil, calendar, operator
+import os, tempfile, shutil
 
 from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.lrf.web import build_index, parse_feeds
 
 RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
 LOGIN = 'http://www.nytimes.com/auth/login'
@@ -36,96 +37,21 @@ def get_feeds(browser):
 
     return feeds
 
-def parse_feeds(feeds, browser, max_articles_per_feed=10):
-    articles = {}
-    for title, url in feeds:
-        src = browser.open(url).read()
-        articles[title] = []
-        soup = BeautifulStoneSoup(src)
-        for item in soup.findAll('item'):
-            try:
-                pubdate = item.find('pubdate').string
-                if not pubdate:
-                    continue
-                pubdate = pubdate.replace('+0000', 'GMT')
-                d = {
-                    'title'    : item.find('title').string,
-                    'url'      : item.find('guid').string+'?&pagewanted=print',
-                    'timestamp': calendar.timegm(time.strptime(pubdate,
-                                     '%a, %d %b %Y %H:%M:%S %Z')),
-                    'date'     : pubdate
-                    }
-            except:
-                continue
-            try:
-                d['description'] = item.find('description').string
-            except:
-                d['description'] = ''
-            articles[title].append(d)
-        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
-        articles[title][max_articles_per_feed:] = []
-        for item in articles[title]:
-            item.pop('timestamp')
-    return articles
-
-
-def build_index(articles, dir):
-
-    def build_sub_index(title, items):
-        ilist = ''
-        li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
-             u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
-        for item in items:
-            ilist += li%item
-        return u'''\
-<html>
-<body>
-<h2>%(title)s</h2>
-<ul>
-%(items)s
-</ul>
-</body>
-</html>
-'''%dict(title=title, items=ilist.rstrip())
-
-    cnum = 0
-    clist = ''
-    categories = articles.keys()
-    categories.sort()
-    for category in categories:
-        cnum += 1
-        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
-        prefix = 'file:' if iswindows else ''
-        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-        src = build_sub_index(category, articles[category])
-        open(cfile, 'wb').write(src.encode('utf-8'))
-
-    src = '''\
-<html>
-<body>
-<h1>The New York Times</h1>
-<div style='text-align: right; font-weight: bold'>%(date)s</div>
-<ul>
-%(categories)s
-</ul>
-</body>
-</html>
-'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist)
-    index = os.path.join(dir, 'index.html')
-    open(index, 'wb').write(src.encode('utf-8'))
-    return index
-
-
 def initialize(profile):
     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
     profile['browser'] = login(profile)
     feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'])
-    index = build_index(articles, profile['temp dir'])
+    articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print')
+    index = build_index('The New York Times', articles, profile['temp dir'])
     profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+    profile['timefmt'] = ' [%a, %d %b, %Y]'
+    profile['max_recursions'] = 2
+    profile['title'] = 'The New York Times'
 
 
 def finalize(profile):
-    shutil.rmtree(profile['temp dir'])
+    if os.path.isdir(profile['temp dir']):
+        shutil.rmtree(profile['temp dir'])
 
 
 def login(profile):
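After this refactor the NYT module supplies only what differs from other profiles: a logged-in browser and its query-string print trick. The lambda's effect, on a made-up article URL:

# Illustration of the NYT print-version rewrite; the URL is hypothetical.
print_version = lambda x: x + '?&pagewanted=print'
print print_version('http://www.nytimes.com/2007/01/01/world/story.html')
# -> http://www.nytimes.com/2007/01/01/world/story.html?&pagewanted=print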
@@ -19,6 +19,8 @@ from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
 from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
 from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
 from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
+from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize
+from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize
 
 profiles = {
@@ -42,9 +44,7 @@ profiles = {
     'nytimes' : {
                  'initialize' : nytimes_initialize,
                  'finalize'   : nytimes_finalize,
-                 'timefmt'    : ' [%a, %d %b, %Y]',
-                 'max_recursions' : 2,
-                 'title'      : 'The New York Times',
                  'preprocess_regexps' :
                  [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                  [
@@ -59,26 +59,24 @@ profiles = {
     },
 
     'bbc' : {
-                 'title'          : 'The BBC',
-                 'no_stylesheets' : True,
+                 'initialize'     : bbc_initialize,
+                 'finalize'       : bbc_finalize,
                  'preprocess_regexps' :
                  [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                  [
-                    # Remove help link and replace by title
-                    (r'<a .*?alt=.Click here for information about this service.*?</a>',
-                     lambda match: '<h1>The BBC</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
-                    # Blank line before categories
-                    (r'<b>\s*BBC', lambda match: '<p></p><b>BBC'),
                     # Remove footer from individual stories
                     (r'<div class=.footer.>.*?Published',
                      lambda match : '<p></p><div class="footer">Published'),
                     # Add some style info in place of disabled stylesheet
-                    (r'<link.*?type=.text/css.*?>',
-                     '<style type="text/css">.headline {font-size: x-large;}</style>'),
+                    (r'<link.*?type=.text/css.*?>', lambda match :
+                     '''<style type="text/css">
+                        .headline {font-size: x-large;}
+                        .ibox { padding: 10pt 10pt 10pt 10pt }
+                        </style>'''),
                  ]
                  ],
     },
 
     'newsweek' : {
                  'initialize' : newsweek_initialize,
                  'finalize'   : newsweek_finalize,
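Each profile's preprocess_regexps entry compiles to a (pattern, replacement) pair that web2lrf applies to the downloaded HTML before conversion. A minimal sketch of that substitution step, assuming the pipeline simply chains re.sub calls (the exact hook in web2lrf is an assumption here, and the sample HTML is invented):

# Sketch of applying a profile's preprocess_regexps; the exact hook in
# web2lrf is assumed, and the sample HTML is invented.
import re

preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
    [
        # Remove footer from individual stories (same rule as the bbc profile)
        (r'<div class=.footer.>.*?Published',
         lambda match: '<p></p><div class="footer">Published'),
    ]
]

def preprocess(html):
    for pat, repl in preprocess_regexps:
        html = pat.sub(repl, html)   # repl may be a string or a function
    return html

print preprocess('<div class="footer">Story tools | Published 2007/04/01</div>')
# -> <p></p><div class="footer">Published 2007/04/01</div>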