Implement #229

parent 4477a78a5b
commit b1dea424fe
@@ -13,12 +13,32 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

import os, time, calendar, operator
import os, time, calendar, operator, re

from libprs500 import iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint

def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
def process_html_description(tag):
    src = '\n'.join(tag.contents)
    replaced_entities = ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']
    for e in replaced_entities:
        ent = '&'+e+';'
        src = src.replace(ent, unichr(name2codepoint[e]))
    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

def parse_feeds(feeds, browser, print_version,
                max_articles_per_feed=10,
                html_description=False,
                oldest_article=7):
    '''
    @param print_version: Callable that takes a url string and returns the url to
                          the printable version of the article pointed to by the original url.
    @param max_articles_per_feed: Maximum number of articles to download from each feed
    @param html_description: If True, the articles' descriptions are processed as HTML
    @param oldest_article: A number in days. No articles older than now - oldest_article
                           will be downloaded.
    '''
    articles = {}
    for title, url in feeds:
        src = browser.open(url).read()
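For reference, the transformation performed by the new process_html_description helper is shown in isolation below. This is an illustrative sketch only (Python 2, like the surrounding code, and not part of the commit); it applies the same entity replacement and <a>-tag stripping to a plain string, whereas the real helper receives a BeautifulStoneSoup tag and joins its contents first.

import re
from htmlentitydefs import name2codepoint

def strip_description(src):
    # Replace a handful of named entities with their unicode characters...
    for e in ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']:
        src = src.replace('&'+e+';', unichr(name2codepoint[e]))
    # ...then drop anchor tags entirely, as process_html_description does.
    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

print repr(strip_description(u'Tom &amp; Jerry &ldquo;scoop&rdquo; <a href="#">Read more</a>'))
# -> u'Tom & Jerry \u201cscoop\u201d '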
@@ -37,10 +57,14 @@ def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
                                          '%a, %d %b %Y %H:%M:%S %Z')),
                     'date'      : pubdate
                    }
                delta = time.time() - d['timestamp']
                if delta > oldest_article*3600*24:
                    continue
            except:
                continue
            try:
                d['description'] = item.find('description').string
                desc = item.find('description')
                d['description'] = process_html_description(desc) if html_description else desc.string
            except:
                d['description'] = ''
            articles[title].append(d)
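Putting the new parameters together, a call to the extended parse_feeds looks roughly like the sketch below. The feed list and the print_version body are placeholders; browser() and the libprs500.ebooks.lrf.web module path are taken from the newsweek imports further down.

from libprs500 import browser
from libprs500.ebooks.lrf.web import parse_feeds

# (title, url) pairs, in the same shape as RSS_FEEDS below
feeds = [('Top Stories', 'http://example.com/rss')]

def print_version(url):
    # Site-specific: map an article url to its printable version
    return url

articles = parse_feeds(feeds, browser(), print_version,
                       max_articles_per_feed=20,
                       html_description=True,  # run descriptions through process_html_description
                       oldest_article=7)       # skip items more than 7 days old
# articles maps each feed title to a list of dicts carrying, among other keys,
# 'timestamp', 'date' and 'description' (see the hunk above).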
@@ -76,7 +76,6 @@ def initialize(profile):
    profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts

def finalize(profile):
    return
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])
@@ -16,9 +16,8 @@

import sys, urllib2, time, re, tempfile, os, shutil

from libprs500 import __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint
from libprs500.ebooks.lrf.web import build_index, parse_feeds
from libprs500 import __appname__, iswindows, browser

RSS_FEEDS = [
    ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
@@ -34,110 +33,24 @@ RSS_FEEDS = [
    ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
]

BASE_TEMPLATE=\
u'''
<html>
<body>
<h1>Newsweek</h1>
<b align="right">%(date)s</b>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

SECTION_TEMPLATE=\
u'''
<html>
<body>
<h2>%(title)s</h2>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

_tdir = None
def create_aggregator(sections):
    '''Return aggregator HTML encoded in utf8'''
    toc, sec = u'', 0
    global _tdir
    _tdir = tempfile.mkdtemp(prefix=__appname__)
    for section in sections:
        sec += 1
        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
        title, contents = section
        fix = 'file:' if iswindows else ''
        toc += '<li><a href="%s">%s</a></li>\n'%(fix+secfile, title,)
        stoc = u''
        for item in contents:
            desc = item['description'].strip()
            stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
            if desc:
                stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
            stoc += '</li>\n'
        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
        open(secfile, 'w').write(section.encode('utf8'))
    index = os.path.join(_tdir, 'index.html')
    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
    open(index, 'w').write(src.encode('utf8'))
    return index

def get_contents():
    ''' Parse Newsweek RSS feeds to get links to all articles'''

    def nstounicode(ns):
        return unicode(str(ns), 'utf8')

    def fix_link(link):
        if '?' in link:
            link = link[:link.index('?')]
        return link + 'print/1/displaymode/1098/'

    def process_description(tag):
        src = '\n'.join(tag.contents)
        replaced_entities = ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']
        for e in replaced_entities:
            ent = '&'+e+';'
            src = src.replace(ent, unichr(name2codepoint[e]))
        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

    pages = []
    for title, url in RSS_FEEDS:
        soup = BeautifulStoneSoup(urllib2.urlopen(url))
        contents = []
        for item in soup.findAll('item'):
            d = {
                'title'      : nstounicode(item.title.contents[0]),
                'description': process_description(item.description),
                'link'       : fix_link(nstounicode(item.guid.contents[0]))
                }
            if '<' in d['description']:
                d['description'] = d['description'][:d['description'].index('<')]
            contents.append(d)
        pages.append((title, contents))
    return pages

def print_version(url):
    if '?' in url:
        url = url[:url.index('?')]
    return url + 'print/1/displaymode/1098/'

def initialize(profile):
    print 'Fetching feeds...',
    sys.stdout.flush()
    contents = get_contents()
    print 'done'
    index = create_aggregator(contents)

    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    profile['browser'] = browser()
    articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
                           max_articles_per_feed=20, html_description=True)
    index = build_index('Newsweek', articles, profile['temp dir'])
    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
    profile['timefmt'] = ' [%d %b %Y]'
    profile['max_recursions'] = 2
    profile['title'] = 'Newsweek'
    profile['url'] = 'file:'+ ('' if iswindows else '//') +index

def finalize(profile):
    global _tdir
    shutil.rmtree(_tdir)
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])
@@ -32,7 +32,7 @@ profiles = {
        'max_recursions' : 1,     # Number of levels of links to follow
        'max_files'      : 1000,  # Maximum number of files to download
        'delay'          : 0,     # Delay between consecutive downloads
        'timeout'        : 10,    # Timeout for fetching files from server
        'timeout'        : 10,    # Timeout for fetching files from server in seconds
        'timefmt'        : ' [%a %d %b %Y]',
        'no_stylesheets' : False, # Download stylesheets
        'match_regexps'  : [],    # List of regular expressions that determine which links to follow
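Individual profiles only override the defaults they care about. A minimal hypothetical entry might look like the sketch below (the 'example' key and its values are illustrative, not part of this commit); the real newsweek entry follows in the next hunk.

'example' : {
    'title'          : 'Example News',
    'timefmt'        : ' [%d %b %Y]',
    'timeout'        : 20,    # seconds, overriding the default of 10
    'max_recursions' : 2,
    'no_stylesheets' : True,
},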
@@ -82,10 +82,7 @@ profiles = {
        'newsweek' : {
            'initialize'         : newsweek_initialize,
            'finalize'           : newsweek_finalize,
            'title'              : 'Newsweek',
            'timefmt'            : ' [%d %b %Y]',
            'no_stylesheets'     : True,
            'max_recursions'     : 2,
            'preprocess_regexps' :
                [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                  [