Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Implement #229

commit b1dea424fe (parent 4477a78a5b)
@@ -13,12 +13,32 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-import os, time, calendar, operator
+import os, time, calendar, operator, re
 
 from libprs500 import iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+from htmlentitydefs import name2codepoint
 
-def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
+def process_html_description(tag):
+    src = '\n'.join(tag.contents)
+    replaced_entities = ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']
+    for e in replaced_entities:
+        ent = '&'+e+';'
+        src = src.replace(ent, unichr(name2codepoint[e]))
+    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+
+def parse_feeds(feeds, browser, print_version,
+                max_articles_per_feed=10,
+                html_description=False,
+                oldest_article=7):
+    '''
+    @param print_version: Callable that takes a url string and returns the url to the
+                          printable version of the article pointed to by the original url.
+    @param max_articles_per_feed: Maximum number of articles to download from each feed
+    @param html_description: If True the article descriptions are processed as HTML
+    @param oldest_article: A number in days. No articles older than now - oldest_article
+                           will be downloaded.
+    '''
     articles = {}
     for title, url in feeds:
         src = browser.open(url).read()
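The new process_html_description() helper unescapes a handful of named entities and strips anchor tags from a feed item's <description>. A minimal sketch of how it behaves, assuming Python 2 and that the helper is importable from libprs500.ebooks.lrf.web (the same module the Newsweek profile imports parse_feeds from below); the sample feed markup is made up:

    # -*- coding: utf-8 -*-
    # Sketch only: exercises process_html_description() on a fabricated RSS item.
    # Assumes the helper lives in libprs500.ebooks.lrf.web alongside parse_feeds.
    from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
    from libprs500.ebooks.lrf.web import process_html_description

    sample = ('<rss><item><description>'
              '&ldquo;Big story&rdquo; &amp; more &lt;a href="http://example.com/x"&gt;Read on&lt;/a&gt;'
              '</description></item></rss>')
    desc = BeautifulStoneSoup(sample).find('description')
    # Entities become their unicode characters, anchors are dropped entirely.
    print process_html_description(desc).encode('utf-8')
    # -> "Big story" & more      (typographic quotes, link removed)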
@@ -37,10 +57,14 @@ def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
                                               '%a, %d %b %Y %H:%M:%S %Z')),
                     'date'      : pubdate
                     }
+                delta = time.time() - d['timestamp']
+                if delta > oldest_article*3600*24:
+                    continue
             except:
                 continue
             try:
-                d['description'] = item.find('description').string
+                desc = item.find('description')
+                d['description'] = process_html_description(desc) if html_description else desc.string
             except:
                 d['description'] = ''
             articles[title].append(d)
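The added age check compares the article timestamp (seconds since the epoch, presumably produced by calendar.timegm() on the parsed pubdate above) against oldest_article, which is given in days. A small worked sketch of that arithmetic with a hypothetical timestamp:

    # Sketch of the cut-off logic added above; the timestamp here is hypothetical.
    import time

    oldest_article = 7                          # days, the new default
    timestamp = time.time() - 8 * 24 * 3600     # article published 8 days ago
    delta = time.time() - timestamp             # age in seconds
    if delta > oldest_article * 3600 * 24:      # 7 days = 604800 seconds
        print 'too old, skipped'                # parse_feeds does `continue` here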
@@ -76,7 +76,6 @@ def initialize(profile):
     profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
 
 def finalize(profile):
-    return
     if os.path.isdir(profile['temp dir']):
         shutil.rmtree(profile['temp dir'])
@@ -16,9 +16,8 @@
 
 import sys, urllib2, time, re, tempfile, os, shutil
 
-from libprs500 import __appname__, iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
+from libprs500.ebooks.lrf.web import build_index, parse_feeds
+from libprs500 import __appname__, iswindows, browser
 
 RSS_FEEDS = [
     ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
@@ -34,110 +33,24 @@ RSS_FEEDS = [
     ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
 ]
 
-BASE_TEMPLATE=\
-u'''
-<html>
-    <body>
-        <h1>Newsweek</h1>
-        <b align="right">%(date)s</b>
-        <p></p>
-        <h2>Table of Contents</h2>
-        <ul>
-        %(toc)s
-        </ul>
-        <br />
-        <hr />
-    </body>
-</html>
-'''
-
-SECTION_TEMPLATE=\
-u'''
-<html>
-    <body>
-        <h2>%(title)s</h2>
-        <p></p>
-        <h2>Table of Contents</h2>
-        <ul>
-        %(toc)s
-        </ul>
-        <br />
-        <hr />
-    </body>
-</html>
-'''
-
-_tdir = None
-def create_aggregator(sections):
-    '''Return aggregator HTML encoded in utf8'''
-    toc, sec = u'', 0
-    global _tdir
-    _tdir = tempfile.mkdtemp(prefix=__appname__)
-    for section in sections:
-        sec += 1
-        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
-        title, contents = section
-        fix = 'file:' if iswindows else ''
-        toc += '<li><a href="%s">%s</a></li>\n'%(fix+secfile, title,)
-        stoc = u''
-        for item in contents:
-            desc = item['description'].strip()
-            stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
-            if desc:
-                stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
-            stoc += '</li>\n'
-        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
-        open(secfile, 'w').write(section.encode('utf8'))
-    index = os.path.join(_tdir, 'index.html')
-    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
-    open(index, 'w').write(src.encode('utf8'))
-    return index
-
-def get_contents():
-    ''' Parse Newsweek RSS feeds to get links to all articles'''
-
-    def nstounicode(ns):
-        return unicode(str(ns), 'utf8')
-
-    def fix_link(link):
-        if '?' in link:
-            link = link[:link.index('?')]
-        return link + 'print/1/displaymode/1098/'
-
-    def process_description(tag):
-        src = '\n'.join(tag.contents)
-        replaced_entities = ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-    pages = []
-    for title, url in RSS_FEEDS:
-        soup = BeautifulStoneSoup(urllib2.urlopen(url))
-        contents = []
-        for item in soup.findAll('item'):
-            d = {
-                 'title'       : nstounicode(item.title.contents[0]),
-                 'description' : process_description(item.description),
-                 'link'        : fix_link(nstounicode(item.guid.contents[0]))
-                }
-            if '<' in d['description']:
-                d['description'] = d['description'][:d['description'].index('<')]
-            contents.append(d)
-        pages.append((title, contents))
-    return pages
-
+def print_version(url):
+    if '?' in url:
+        url = url[:url.index('?')]
+    return url + 'print/1/displaymode/1098/'
+
 def initialize(profile):
-    print 'Fetching feeds...',
-    sys.stdout.flush()
-    contents = get_contents()
-    print 'done'
-    index = create_aggregator(contents)
+    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
+    profile['browser'] = browser()
+    articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
+                           max_articles_per_feed=20, html_description=True)
+    index = build_index('Newsweek', articles, profile['temp dir'])
+    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+    profile['timefmt'] = ' [%d %b %Y]'
+    profile['max_recursions'] = 2
+    profile['title'] = 'Newsweek'
     profile['url'] = 'file:'+ ('' if iswindows else '//') +index
 
 def finalize(profile):
-    global _tdir
-    shutil.rmtree(_tdir)
+    if os.path.isdir(profile['temp dir']):
+        shutil.rmtree(profile['temp dir'])
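After this change a site profile no longer builds its own HTML aggregator; it fetches feeds with parse_feeds() and hands the result to build_index(). A minimal sketch of a profile in the new style, modelled on the Newsweek code above. The feed URL and the print-version suffix are placeholders, and build_index() is assumed to take (title, articles, dir) and return the path of the generated index page, as it is called here:

    # Sketch of a web profile written against the new helpers (placeholder feed).
    import os, shutil, tempfile

    from libprs500 import __appname__, iswindows, browser
    from libprs500.ebooks.lrf.web import build_index, parse_feeds

    FEEDS = [('Front Page', 'http://example.com/rss.xml')]   # hypothetical feed

    def print_version(url):
        # Site specific: map an article URL to its printer friendly variant.
        return url + '?printable=1'                          # made-up suffix

    def initialize(profile):
        profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
        profile['browser'] = browser()
        articles = parse_feeds(FEEDS, profile['browser'], print_version,
                               max_articles_per_feed=10, html_description=True,
                               oldest_article=7)
        index = build_index('Example News', articles, profile['temp dir'])
        profile['url'] = 'file:' + ('' if iswindows else '//') + index

    def finalize(profile):
        if os.path.isdir(profile['temp dir']):
            shutil.rmtree(profile['temp dir'])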
@@ -32,7 +32,7 @@ profiles = {
              'max_recursions'      : 1,     # Number of levels of links to follow
              'max_files'           : 1000,  # Maximum number of files to download
              'delay'               : 0,     # Delay between consecutive downloads
-             'timeout'             : 10,    # Timeout for fetching files from server
+             'timeout'             : 10,    # Timeout for fetching files from server in seconds
              'timefmt'             : ' [%a %d %b %Y]',
              'no_stylesheets'      : False, # Download stylesheets
              'match_regexps'       : [],    # List of regular expressions that determines which links to follow
@@ -82,10 +82,7 @@ profiles = {
              'newsweek' : {
                    'initialize'         : newsweek_initialize,
                    'finalize'           : newsweek_finalize,
-                   'title'              : 'Newsweek',
-                   'timefmt'            : ' [%d %b %Y]',
                    'no_stylesheets'     : True,
-                   'max_recursions'     : 2,
                    'preprocess_regexps' :
                        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                          [
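The preprocess_regexps entry, truncated above, is a list comprehension that compiles each (pattern, replacement) pair; the pairs are presumably applied to each downloaded page before conversion. A hedged sketch of that structure with a placeholder pattern (not the actual Newsweek expressions, which the hunk cuts off):

    # Sketch of the preprocess_regexps structure; the pattern below is a placeholder.
    import re

    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            (r'<!--.*?-->', ''),           # e.g. strip HTML comments
        ]
    ]

    html = '<p>Story</p><!-- ad slot -->'
    for pat, repl in preprocess_regexps:
        html = pat.sub(repl, html)
    print html                             # -> <p>Story</p>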